# import all the python packages that will be needed. If any package is missing, install it first using this command:
# !pip install <package name>
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt # seaborn is based on matplotlib
import matplotlib.ticker as tkr
sns.set(color_codes=True) # adds a nice background to the graphs
%matplotlib inline
from pandas_profiling import ProfileReport
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import (
train_test_split,
) # Sklearn package's randomized data splitting function
import warnings
warnings.filterwarnings("ignore") # During the final run, warnings can be disabled.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
# To fill the missing values
from sklearn.impute import SimpleImputer
# To add attributes based on the zip code
import uszipcode
from uszipcode import SearchEngine
import nb_black
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix,
precision_recall_curve,
roc_curve,
make_scorer,
)
# Sequential feature selector is present in mlxtend library
#!pip install mlxtend
# to install the mlxtend library
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# to plot the performance with addition of each feature
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
# To tune different models
from sklearn.model_selection import GridSearchCV
%load_ext lab_black
The lab_black extension is already loaded. To reload it, use: %reload_ext lab_black
# Read in the data file. Get the row/column counts. See a few rows to make sure the reading was done correctly
# NOTE(review): despite the name, `url` is a local file path, not a URL.
url = "Loan_Modelling.csv"
lmData = pd.read_csv(url)
# Quick sanity checks: shape, then dtypes/non-null counts, then the first rows.
print("The dataset has", lmData.shape[0], "tuples and", lmData.shape[1], "attributes.")
print("")
print(lmData.info())
print("")
lmData.head()
The dataset has 5000 tuples and 14 attributes. <class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 Experience 5000 non-null int64 3 Income 5000 non-null int64 4 ZIPCode 5000 non-null int64 5 Family 5000 non-null int64 6 CCAvg 5000 non-null float64 7 Education 5000 non-null int64 8 Mortgage 5000 non-null int64 9 Personal_Loan 5000 non-null int64 10 Securities_Account 5000 non-null int64 11 CD_Account 5000 non-null int64 12 Online 5000 non-null int64 13 CreditCard 5000 non-null int64 dtypes: float64(1), int64(13) memory usage: 547.0 KB None
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
# Let's keep a copy of the original dataset so that we will have access to it if we do field transformations on the main dataframe.
# NOTE(review): this re-reads the CSV from disk; lmData.copy() would achieve the same without the second file read.
lmdata_orig = pd.read_csv(url)
# We have enough number of rows and hence the training/test samples will be large enough to build a logistic regression and decision tree model.
# The id field can be dropped.
# Data types above are as expected.
# Let's see some basic stats (mean, min, max, median etc.) of the columns. For non-numeric columns, not all metrics make sense. These will show up as NaN.
lmData.describe(include="all").T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 5000.0 | 2500.500000 | 1443.520003 | 1.0 | 1250.75 | 2500.5 | 3750.25 | 5000.0 |
| Age | 5000.0 | 45.338400 | 11.463166 | 23.0 | 35.00 | 45.0 | 55.00 | 67.0 |
| Experience | 5000.0 | 20.104600 | 11.467954 | -3.0 | 10.00 | 20.0 | 30.00 | 43.0 |
| Income | 5000.0 | 73.774200 | 46.033729 | 8.0 | 39.00 | 64.0 | 98.00 | 224.0 |
| ZIPCode | 5000.0 | 93169.257000 | 1759.455086 | 90005.0 | 91911.00 | 93437.0 | 94608.00 | 96651.0 |
| Family | 5000.0 | 2.396400 | 1.147663 | 1.0 | 1.00 | 2.0 | 3.00 | 4.0 |
| CCAvg | 5000.0 | 1.937938 | 1.747659 | 0.0 | 0.70 | 1.5 | 2.50 | 10.0 |
| Education | 5000.0 | 1.881000 | 0.839869 | 1.0 | 1.00 | 2.0 | 3.00 | 3.0 |
| Mortgage | 5000.0 | 56.498800 | 101.713802 | 0.0 | 0.00 | 0.0 | 101.00 | 635.0 |
| Personal_Loan | 5000.0 | 0.096000 | 0.294621 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Securities_Account | 5000.0 | 0.104400 | 0.305809 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| CD_Account | 5000.0 | 0.060400 | 0.238250 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Online | 5000.0 | 0.596800 | 0.490589 | 0.0 | 0.00 | 1.0 | 1.00 | 1.0 |
| CreditCard | 5000.0 | 0.294000 | 0.455637 | 0.0 | 0.00 | 0.0 | 1.00 | 1.0 |
# Let's check if there are any blanks
# Per-column count of missing values; all zeros means no imputation is needed.
lmData.isnull().sum()
ID 0 Age 0 Experience 0 Income 0 ZIPCode 0 Family 0 CCAvg 0 Education 0 Mortgage 0 Personal_Loan 0 Securities_Account 0 CD_Account 0 Online 0 CreditCard 0 dtype: int64
# Let's use the pandas profile package to create a basic report on the dataset
# The profile report is in a cell with its own scrolling. So scroll within the cell to see the entire report.
profile = ProfileReport(lmData)
profile
# Save the report file as an HTML for future viewing
profile.to_file("cdata_profile3.html")
# Experience attribute has a minimum value of -3. This is strange. Experience can be 0. But what does a negative value mean?
# Let's look at such rows.
lmData[lmData["Experience"] < 0]
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 89 | 90 | 25 | -1 | 113 | 94303 | 4 | 2.30 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 226 | 227 | 24 | -1 | 39 | 94085 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 315 | 316 | 24 | -2 | 51 | 90630 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 451 | 452 | 28 | -2 | 48 | 94132 | 2 | 1.75 | 3 | 89 | 0 | 0 | 0 | 1 | 0 |
| 524 | 525 | 24 | -1 | 75 | 93014 | 4 | 0.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 536 | 537 | 25 | -1 | 43 | 92173 | 3 | 2.40 | 2 | 176 | 0 | 0 | 0 | 1 | 0 |
| 540 | 541 | 25 | -1 | 109 | 94010 | 4 | 2.30 | 3 | 314 | 0 | 0 | 0 | 1 | 0 |
| 576 | 577 | 25 | -1 | 48 | 92870 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 583 | 584 | 24 | -1 | 38 | 95045 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 597 | 598 | 24 | -2 | 125 | 92835 | 2 | 7.20 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 649 | 650 | 25 | -1 | 82 | 92677 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 670 | 671 | 23 | -1 | 61 | 92374 | 4 | 2.60 | 1 | 239 | 0 | 0 | 0 | 1 | 0 |
| 686 | 687 | 24 | -1 | 38 | 92612 | 4 | 0.60 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 793 | 794 | 24 | -2 | 150 | 94720 | 2 | 2.00 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 889 | 890 | 24 | -2 | 82 | 91103 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 1 |
| 909 | 910 | 23 | -1 | 149 | 91709 | 1 | 6.33 | 1 | 305 | 0 | 0 | 0 | 0 | 1 |
| 1173 | 1174 | 24 | -1 | 35 | 94305 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1428 | 1429 | 25 | -1 | 21 | 94583 | 4 | 0.40 | 1 | 90 | 0 | 0 | 0 | 1 | 0 |
| 1522 | 1523 | 25 | -1 | 101 | 94720 | 4 | 2.30 | 3 | 256 | 0 | 0 | 0 | 0 | 1 |
| 1905 | 1906 | 25 | -1 | 112 | 92507 | 2 | 2.00 | 1 | 241 | 0 | 0 | 0 | 1 | 0 |
| 2102 | 2103 | 25 | -1 | 81 | 92647 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 1 |
| 2430 | 2431 | 23 | -1 | 73 | 92120 | 4 | 2.60 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2466 | 2467 | 24 | -2 | 80 | 94105 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2545 | 2546 | 25 | -1 | 39 | 94720 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2618 | 2619 | 23 | -3 | 55 | 92704 | 3 | 2.40 | 2 | 145 | 0 | 0 | 0 | 1 | 0 |
| 2717 | 2718 | 23 | -2 | 45 | 95422 | 4 | 0.60 | 2 | 0 | 0 | 0 | 0 | 1 | 1 |
| 2848 | 2849 | 24 | -1 | 78 | 94720 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2876 | 2877 | 24 | -2 | 80 | 91107 | 2 | 1.60 | 3 | 238 | 0 | 0 | 0 | 0 | 0 |
| 2962 | 2963 | 23 | -2 | 81 | 91711 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2980 | 2981 | 25 | -1 | 53 | 94305 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3076 | 3077 | 29 | -1 | 62 | 92672 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3130 | 3131 | 23 | -2 | 82 | 92152 | 2 | 1.80 | 2 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3157 | 3158 | 23 | -1 | 13 | 94720 | 4 | 1.00 | 1 | 84 | 0 | 0 | 0 | 1 | 0 |
| 3279 | 3280 | 26 | -1 | 44 | 94901 | 1 | 2.00 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3284 | 3285 | 25 | -1 | 101 | 95819 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3292 | 3293 | 25 | -1 | 13 | 95616 | 4 | 0.40 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3394 | 3395 | 25 | -1 | 113 | 90089 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3425 | 3426 | 23 | -1 | 12 | 91605 | 4 | 1.00 | 1 | 90 | 0 | 0 | 0 | 1 | 0 |
| 3626 | 3627 | 24 | -3 | 28 | 90089 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3796 | 3797 | 24 | -2 | 50 | 94920 | 3 | 2.40 | 2 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3824 | 3825 | 23 | -1 | 12 | 95064 | 4 | 1.00 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3887 | 3888 | 24 | -2 | 118 | 92634 | 2 | 7.20 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 3946 | 3947 | 25 | -1 | 40 | 93117 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4015 | 4016 | 25 | -1 | 139 | 93106 | 2 | 2.00 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4088 | 4089 | 29 | -1 | 71 | 94801 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4116 | 4117 | 24 | -2 | 135 | 90065 | 2 | 7.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4285 | 4286 | 23 | -3 | 149 | 93555 | 2 | 7.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4411 | 4412 | 23 | -2 | 75 | 90291 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 1 | 1 |
| 4481 | 4482 | 25 | -2 | 35 | 95045 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4514 | 4515 | 24 | -3 | 41 | 91768 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4582 | 4583 | 25 | -1 | 69 | 92691 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4957 | 4958 | 29 | -1 | 50 | 95842 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
# Isolate the records with negative Experience and summarise them:
# how many there are, their loan-acceptance split, and their age profile.
negExp = lmData[lmData["Experience"] < 0]
print("There are ", negExp.shape[0], "rows with negative experience.")
print(negExp["Personal_Loan"].value_counts())
print(negExp["Age"].value_counts())
There are 52 rows with negative experience. 0 52 Name: Personal_Loan, dtype: int64 25 18 24 17 23 12 29 3 28 1 26 1 Name: Age, dtype: int64
# Let's look at the overall distribution of Personal_Loan
# The target is imbalanced (far more 0s than 1s) — relevant later for metric choice.
print(lmData["Personal_Loan"].value_counts())
0 4520 1 480 Name: Personal_Loan, dtype: int64
# Let's delete the rows with negative experience
# inplace drop by index of the offending rows; 52 rows removed.
lmData.drop(lmData[lmData["Experience"] < 0].index, inplace=True)
# Let's see if there's any significant change in the attributes.
lmData.describe(include="all").T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 4948.0 | 2501.268795 | 1443.277676 | 1.0 | 1254.75 | 2497.5 | 3750.25 | 5000.0 |
| Age | 4948.0 | 45.557195 | 11.320735 | 24.0 | 36.00 | 46.0 | 55.00 | 67.0 |
| Experience | 4948.0 | 20.331043 | 11.311973 | 0.0 | 10.75 | 20.0 | 30.00 | 43.0 |
| Income | 4948.0 | 73.814470 | 46.112596 | 8.0 | 39.00 | 64.0 | 98.00 | 224.0 |
| ZIPCode | 4948.0 | 93168.503436 | 1761.076366 | 90005.0 | 91911.00 | 93437.0 | 94608.00 | 96651.0 |
| Family | 4948.0 | 2.391471 | 1.148444 | 1.0 | 1.00 | 2.0 | 3.00 | 4.0 |
| CCAvg | 4948.0 | 1.935926 | 1.747694 | 0.0 | 0.70 | 1.5 | 2.60 | 10.0 |
| Education | 4948.0 | 1.878941 | 0.839745 | 1.0 | 1.00 | 2.0 | 3.00 | 3.0 |
| Mortgage | 4948.0 | 56.634398 | 101.828885 | 0.0 | 0.00 | 0.0 | 101.00 | 635.0 |
| Personal_Loan | 4948.0 | 0.097009 | 0.296000 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Securities_Account | 4948.0 | 0.104285 | 0.305660 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| CD_Account | 4948.0 | 0.061035 | 0.239418 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Online | 4948.0 | 0.597009 | 0.490549 | 0.0 | 0.00 | 1.0 | 1.00 | 1.0 |
| CreditCard | 4948.0 | 0.294058 | 0.455664 | 0.0 | 0.00 | 0.0 | 1.00 | 1.0 |
Compared to earlier, these values are virtually unchanged. So we can proceed.
# Let's drop the ID field — it is a row identifier with no predictive value.
lmData = lmData.drop(["ID"], axis=1)
lmData.head()
| Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
lmData.columns
Index(['Age', 'Experience', 'Income', 'ZIPCode', 'Family', 'CCAvg',
'Education', 'Mortgage', 'Personal_Loan', 'Securities_Account',
'CD_Account', 'Online', 'CreditCard'],
dtype='object')
# Frequency tables for each of the discrete/binary attributes.
categorical_cols = [
    "Family",
    "Education",
    "Personal_Loan",
    "Securities_Account",
    "CD_Account",
    "Online",
    "CreditCard",
]
for col in categorical_cols:
    print(f"Column = {col}")
    print(lmData[col].value_counts())
    print("")
Column = Family 1 1470 2 1274 4 1203 3 1001 Name: Family, dtype: int64 Column = Education 1 2080 3 1481 2 1387 Name: Education, dtype: int64 Column = Personal_Loan 0 4468 1 480 Name: Personal_Loan, dtype: int64 Column = Securities_Account 0 4432 1 516 Name: Securities_Account, dtype: int64 Column = CD_Account 0 4646 1 302 Name: CD_Account, dtype: int64 Column = Online 1 2954 0 1994 Name: Online, dtype: int64 Column = CreditCard 0 3493 1 1455 Name: CreditCard, dtype: int64
# Next, let's do a univariate analysis of the attributes, especially those we identified earlier as interesting based on the pandas profile report.
# We will also look at the outliers as part of this. We will deal with blanks after this initial analysis
# Re-display the head as a reference point before the plots below.
lmData.head()
| Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
# We will use the utility functions provided in the reference solution of Project 1.
# While doing uni-variate analysis of numerical variables we want to study their central tendency
# and dispersion.
# Let us write a function that will help us create boxplot and histogram for any input numerical
# variable.
# This function takes the numerical column as the input and returns the boxplots
# and histograms for the variable.
# Let us see if this help us write faster and cleaner code.
# function to plot a boxplot and a histogram along the same scale.
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None)
    """
    # Two stacked subplots sharing the x-axis: slim boxplot on top, histogram below.
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid= 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot will be created and a star will indicate the mean value of the column
    # FIX: the original used a conditional *expression* purely for its side
    # effects, which is unidiomatic and hard to read; an explicit if/else is
    # clearer. Also dropped `palette="winter"`: seaborn ignores `palette`
    # when no `hue` is given and emits a warning.
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # Add mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # Add median to the histogram
# function to create labeled barplots
# This code is also from the reference solution of the previous project.
# function to create labeled barplots
# This code is also from the reference solution of the previous project.
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # length of the column
    count = data[feature].nunique()
    # Scale the figure width with the number of bars actually shown.
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        # FIX: test truthiness directly instead of the unidiomatic `perc == True`.
        if perc:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category

        x = p.get_x() + p.get_width() / 2  # width of the plot
        y = p.get_height()  # height of the plot

        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the percentage
    plt.show()  # show the plot
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    n_levels = data[predictor].nunique()
    # Sort the rows by the rarest target class so the "most positive" predictor
    # levels appear first in both the printed table and the chart.
    sort_col = data[target].value_counts().index[-1]
    # Row-normalized crosstab including the "All" margin, as percentages.
    with_totals = pd.crosstab(
        data[predictor], data[target], margins=True, normalize="index"
    )
    with_totals = with_totals.sort_values(by=sort_col, ascending=False).round(4) * 100
    print(with_totals)
    print("-" * 50)
    # Same table without the margin row, used for the stacked bar chart.
    pct = pd.crosstab(data[predictor], data[target], normalize="index")
    pct = pct.sort_values(by=sort_col, ascending=False).round(4) * 100
    pct.plot(kind="bar", stacked=True, figsize=(n_levels + 5, 5))
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
# functions to treat outliers by flooring and capping
def treat_outliers(df, col):
    """
    Treats outliers in a variable

    df: dataframe
    col: dataframe column
    """
    # Tukey fences: anything beyond 1.5 * IQR from the quartiles is an outlier.
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    # Floor values below the lower fence and cap values above the upper fence,
    # writing back into the same column.
    df[col] = np.clip(df[col], lower_whisker, upper_whisker)
    return df
# Calls the above function for all the columns passed.
def treat_outliers_all(df, col_list):
    """
    Treat outliers in a list of variables

    df: dataframe
    col_list: list of dataframe columns
    """
    # Apply the single-column treatment to each requested column in turn.
    for column in col_list:
        df = treat_outliers(df, column)
    return df
# Univariate views of the main numeric attributes.
histogram_boxplot(lmData, "Age")
histogram_boxplot(lmData, "Experience")
histogram_boxplot(lmData, "Income")
# Income is right-skewed; check whether a log transform looks closer to normal.
log_income = pd.DataFrame(np.log(lmData["Income"]))
histogram_boxplot(log_income, "Income")
# Store log of Income in a new attribute. Since Income is > 0 for all rows, we can safely apply the log function
lmData["logIncome"] = np.log(lmData["Income"])
# Let's look at the newly added attribute.
lmData.head()
| Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | logIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 3.891820 |
| 1 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 3.526361 |
| 2 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2.397895 |
| 3 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 4.605170 |
| 4 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 3.806662 |
# Let's drop the original Income attribute.
lmData = lmData.drop(["Income"], axis=1)
histogram_boxplot(lmData, "ZIPCode")
# We will look at bar plots for categorical columns
labeled_barplot(lmData, "Family", perc=True)
histogram_boxplot(lmData, "CCAvg")
# Add a 1 before applying the log function (CCAvg can be 0 and log(0) is undefined).
# This avoids the error, and the 0 cases stay 0 (since log of 1 is 0.)
log_CCAvg = pd.DataFrame(np.log(1 + lmData["CCAvg"]))
histogram_boxplot(log_CCAvg, "CCAvg")
# Let's add the log attribute to the data set.
lmData["logCCAvg"] = np.log(1 + lmData["CCAvg"])
# Let's confirm the distribution.
histogram_boxplot(lmData, "logCCAvg")
lmData = lmData.drop(["CCAvg"], axis=1)
# Let's take a look at the dataset as it looks now.
lmData.head()
| Age | Experience | ZIPCode | Family | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | logIncome | logCCAvg | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 91107 | 4 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 3.891820 | 0.955511 |
| 1 | 45 | 19 | 90089 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 3.526361 | 0.916291 |
| 2 | 39 | 15 | 94720 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2.397895 | 0.693147 |
| 3 | 35 | 9 | 94112 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 4.605170 | 1.308333 |
| 4 | 35 | 8 | 91330 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 3.806662 | 0.693147 |
# We will look at bar plots for categorical columns
labeled_barplot(lmData, "Education", perc=True)
histogram_boxplot(lmData, "Mortgage")
# Most customers have Mortgage == 0, so also look at only the positive values.
histogram_boxplot(lmData[lmData["Mortgage"] > 0], "Mortgage")
# As earlier, we add a 1 before doing the log.
log_Mortgage = pd.DataFrame(np.log(1 + lmData["Mortgage"]))
histogram_boxplot(log_Mortgage[log_Mortgage["Mortgage"] > 0], "Mortgage")
# Let's add the log attribute and drop the original Mortgage attribute
lmData["logMortgage"] = np.log(1 + lmData["Mortgage"])
lmData = lmData.drop(["Mortgage"], axis=1)
# Let's confirm the columns
lmData.head()
| Age | Experience | ZIPCode | Family | Education | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | logIncome | logCCAvg | logMortgage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 91107 | 4 | 1 | 0 | 1 | 0 | 0 | 0 | 3.891820 | 0.955511 | 0.0 |
| 1 | 45 | 19 | 90089 | 3 | 1 | 0 | 1 | 0 | 0 | 0 | 3.526361 | 0.916291 | 0.0 |
| 2 | 39 | 15 | 94720 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 2.397895 | 0.693147 | 0.0 |
| 3 | 35 | 9 | 94112 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 4.605170 | 1.308333 | 0.0 |
| 4 | 35 | 8 | 91330 | 4 | 2 | 0 | 0 | 0 | 0 | 1 | 3.806662 | 0.693147 | 0.0 |
# Bar plots for the remaining binary columns, then the correlation matrix.
# We will look at bar plots for categorical columns
labeled_barplot(lmData, "Personal_Loan", perc=True)
# We will look at bar plots for categorical columns
labeled_barplot(lmData, "Securities_Account", perc=True)
# We will look at bar plots for categorical columns
labeled_barplot(lmData, "CD_Account", perc=True)
# We will look at bar plots for categorical columns
labeled_barplot(lmData, "Online", perc=True)
# We will look at bar plots for categorical columns
labeled_barplot(lmData, "CreditCard", perc=True)
# Let's see the correlation numbers.
lmData.corr()
| Age | Experience | ZIPCode | Family | Education | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | logIncome | logCCAvg | logMortgage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1.000000 | 0.994101 | -0.030790 | -0.039279 | 0.046222 | -0.014204 | 0.000496 | 0.003261 | 0.013542 | 0.007496 | -0.059096 | -0.048398 | -0.013510 |
| Experience | 0.994101 | 1.000000 | -0.030525 | -0.045610 | 0.018243 | -0.014121 | -0.000457 | 0.005450 | 0.013518 | 0.008876 | -0.052617 | -0.049159 | -0.012552 |
| ZIPCode | -0.030790 | -0.030525 | 1.000000 | 0.026883 | -0.008165 | -0.002848 | 0.001013 | 0.021881 | 0.031030 | 0.023778 | -0.028680 | -0.012558 | 0.006774 |
| Family | -0.039279 | -0.045610 | 0.026883 | 1.000000 | 0.064032 | 0.063088 | 0.020155 | 0.015274 | 0.008466 | 0.012905 | -0.120491 | -0.081371 | 0.000216 |
| Education | 0.046222 | 0.018243 | -0.008165 | 0.064032 | 1.000000 | 0.138339 | -0.007508 | 0.014639 | -0.013932 | -0.012604 | -0.163965 | -0.104818 | -0.005136 |
| Personal_Loan | -0.014204 | -0.014121 | -0.002848 | 0.063088 | 0.138339 | 1.000000 | 0.022216 | 0.315769 | 0.006175 | 0.002776 | 0.405971 | 0.337875 | 0.050668 |
| Securities_Account | 0.000496 | -0.000457 | 0.001013 | 0.020155 | -0.007508 | 0.022216 | 1.000000 | 0.319056 | 0.016101 | -0.017030 | 0.001394 | 0.015520 | -0.001092 |
| CD_Account | 0.003261 | 0.005450 | 0.021881 | 0.015274 | 0.014639 | 0.315769 | 0.319056 | 1.000000 | 0.176768 | 0.280151 | 0.133428 | 0.121679 | 0.050065 |
| Online | 0.013542 | 0.013518 | 0.031030 | 0.008466 | -0.013932 | 0.006175 | 0.016101 | 0.176768 | 1.000000 | 0.008457 | 0.015393 | -0.005824 | -0.006969 |
| CreditCard | 0.007496 | 0.008876 | 0.023778 | 0.012905 | -0.012604 | 0.002776 | -0.017030 | 0.280151 | 0.008457 | 1.000000 | -0.007165 | -0.005827 | -0.003779 |
| logIncome | -0.059096 | -0.052617 | -0.028680 | -0.120491 | -0.163965 | 0.405971 | 0.001394 | 0.133428 | 0.015393 | -0.007165 | 1.000000 | 0.594219 | 0.028516 |
| logCCAvg | -0.048398 | -0.049159 | -0.012558 | -0.081371 | -0.104818 | 0.337875 | 0.015520 | 0.121679 | -0.005824 | -0.005827 | 0.594219 | 1.000000 | 0.009717 |
| logMortgage | -0.013510 | -0.012552 | 0.006774 | 0.000216 | -0.005136 | 0.050668 | -0.001092 | 0.050065 | -0.006969 | -0.003779 | 0.028516 | 0.009717 | 1.000000 |
# Let's see the above information as a heatmap showing the correlation between various numerical attributes
plt.figure(figsize=(15, 7))
sns.heatmap(lmData.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
# Let's see the correlation between various numerical attributes in the original dataset before applying any log function.
plt.figure(figsize=(15, 7))
sns.heatmap(lmdata_orig.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
# Let's see the pairwise relationship look like for different Personal_Loan
# Full pairplot over all columns; the subset pairplots below are easier to read.
sns.pairplot(data=lmData, hue="Personal_Loan", diag_kind="kde")
plt.show()
# Let's see the pairwise relationship look like for different Personal_Loan
# Each call below pairplots one subset of attributes, colored by Personal_Loan.
sns.pairplot(
    data=lmData[
        [
            "Personal_Loan",
            "Age",
            "Experience",
            "ZIPCode",
            "Family",
            "Education",
            "Securities_Account",
        ]
    ],
    hue="Personal_Loan",
    diag_kind="kde",
)
plt.show()
# Let's see the pairwise relationship look like for different Personal_Loan for another set of attributes
sns.pairplot(
    data=lmData[
        [
            "Personal_Loan",
            "CD_Account",
            "Online",
            "CreditCard",
            "logIncome",
            "logCCAvg",
            "logMortgage",
        ]
    ],
    hue="Personal_Loan",
    diag_kind="kde",
)
plt.show()
# Let's see the pairwise relationship look like for different Personal_Loan. This time with another set of attributes.
sns.pairplot(
    data=lmData[
        [
            "Personal_Loan",
            "Age",
            "Experience",
            "ZIPCode",
            "CD_Account",
            "Online",
            "CreditCard",
        ]
    ],
    hue="Personal_Loan",
    diag_kind="kde",
)
plt.show()
# Let's see the pairwise relationship look like for different Personal_Loan. Another combination of attributes chosen this time.
sns.pairplot(
    data=lmData[
        [
            "Personal_Loan",
            "logIncome",
            "logCCAvg",
            "logMortgage",
            "Family",
            "Education",
            "Securities_Account",
        ]
    ],
    hue="Personal_Loan",
    diag_kind="kde",
)
plt.show()
# Let's see the pairwise relationship look like for different Personal_Loan. Another combination of attributes.
sns.pairplot(
    data=lmData[
        ["Personal_Loan", "logIncome", "logCCAvg", "logMortgage", "Age", "Experience"]
    ],
    hue="Personal_Loan",
    diag_kind="kde",
)
plt.show()
# NOTE(review): seaborn >= 0.12 removed positional x/y arguments; these
# boxplot calls would need x=/y= keywords on newer seaborn — confirm version.
# Age vs Experience
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Age"], lmData["Experience"], palette="PuBu")
plt.show()
# Age vs ZIPCode
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Age"], lmData["ZIPCode"], palette="PuBu")
plt.show()
# Age vs Family
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Family"], lmData["Age"], palette="PuBu")
plt.show()
# Category-level view of family size per age.
stacked_barplot(lmData, "Age", "Family")
Family 1 2 3 4 Age 24 36.36 0.00 36.36 27.27 66 33.33 16.67 33.33 16.67 65 31.25 16.25 32.50 20.00 43 28.19 21.48 32.21 18.12 61 26.23 24.59 29.51 19.67 42 34.13 22.22 29.37 14.29 45 26.77 25.98 29.13 18.11 39 27.82 31.58 27.82 12.78 60 29.92 26.77 27.56 15.75 62 32.52 24.39 26.02 17.07 28 24.51 20.59 25.49 29.41 56 22.96 25.93 25.19 25.93 57 26.52 27.27 25.00 21.21 37 19.81 33.02 24.53 22.64 55 17.60 32.00 24.00 26.40 41 35.29 30.88 21.32 12.50 58 24.48 30.07 20.98 24.48 27 24.18 19.78 20.88 35.16 All 29.71 25.75 20.23 24.31 34 32.84 18.66 20.15 28.36 35 23.84 24.50 19.87 31.79 44 29.75 26.45 19.83 23.97 26 15.58 31.17 19.48 33.77 64 28.21 32.05 19.23 20.51 46 28.35 29.92 18.90 22.83 48 39.83 21.19 18.64 20.34 52 34.48 24.83 17.24 23.45 36 23.36 26.17 16.82 33.64 47 40.71 20.35 16.81 22.12 40 27.20 34.40 16.80 21.60 67 16.67 25.00 16.67 41.67 32 28.33 27.50 16.67 27.50 54 27.97 25.87 16.08 30.07 59 28.03 28.79 15.91 27.27 33 40.00 18.33 15.83 25.83 29 25.83 21.67 15.83 36.67 49 40.87 18.26 15.65 25.22 38 22.61 36.52 14.78 26.09 30 30.15 22.06 14.71 33.09 53 25.00 30.36 14.29 30.36 25 17.14 34.29 14.29 34.29 51 44.19 20.93 13.95 20.93 63 27.78 35.19 12.96 24.07 50 47.83 18.84 9.42 23.91 31 37.60 26.40 7.20 28.80 --------------------------------------------------
# Bivariate boxplots: each categorical/binary attribute vs Age or Experience.
# Age vs Education
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Education"], lmData["Age"], palette="PuBu")
plt.show()
# Age vs Securities_Account
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Securities_Account"], lmData["Age"], palette="PuBu")
plt.show()
# Age vs CD_Account
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["CD_Account"], lmData["Age"], palette="PuBu")
plt.show()
# Age vs Online
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Online"], lmData["Age"], palette="PuBu")
plt.show()
# Age vs CreditCard
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["CreditCard"], lmData["Age"], palette="PuBu")
plt.show()
# Experience vs Family size
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Family"], lmData["Experience"], palette="PuBu")
plt.show()
# Education vs Experience
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Education"], lmData["Experience"], palette="PuBu")
plt.show()
# Experience vs Personal_Loan
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Personal_Loan"], lmData["Experience"], palette="PuBu")
plt.show()
# Experience vs Securities_Account
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Securities_Account"], lmData["Experience"], palette="PuBu")
plt.show()
# CD_Account vs Experience
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["CD_Account"], lmData["Experience"], palette="PuBu")
plt.show()
# Experience vs Online
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["Online"], lmData["Experience"], palette="PuBu")
plt.show()
# Experience vs CreditCard
plt.figure(figsize=(15, 7))
sns.boxplot(lmData["CreditCard"], lmData["Experience"], palette="PuBu")
plt.show()
# Family vs Personal_Loan
plt.figure(figsize=(15, 7))
sns.countplot(x="Family", data=lmData, palette="rainbow", hue="Personal_Loan")
plt.legend(loc="upper right")
plt.title("Count of customers of different family sizes, Separated by Personal_Loan")
Text(0.5, 1.0, 'Count of customers of different family sizes, Separated by Personal_Loan')
# Let's see the above information as percentage stacked bars.
# Larger families accept loans more often (see printed table).
stacked_barplot(lmData, "Family", "Personal_Loan")
Personal_Loan 0 1 Family 3 86.71 13.29 4 88.86 11.14 All 90.30 9.70 2 91.68 8.32 1 92.72 7.28 --------------------------------------------------
# Education vs Personal_Loan
plt.figure(figsize=(15, 7))
sns.countplot(x="Education", data=lmData, palette="rainbow", hue="Personal_Loan")
plt.legend(loc="upper right")
plt.title("Count of customers of different Education, Separated by Personal_Loan")
Text(0.5, 1.0, 'Count of customers of different Education, Separated by Personal_Loan')
# Let's see the above information as percentage stacked bars.
# Higher education levels show a higher loan-acceptance rate.
stacked_barplot(lmData, "Education", "Personal_Loan")
Personal_Loan 0 1 Education 3 86.16 13.84 2 86.88 13.12 All 90.30 9.70 1 95.53 4.47 --------------------------------------------------
# Securities_Account vs Personal_Loan
plt.figure(figsize=(15, 7))
sns.countplot(
    x="Securities_Account", data=lmData, palette="rainbow", hue="Personal_Loan"
)
plt.legend(loc="upper right")
plt.title(
    "Count of customers of different Securities_Account, Separated by Personal_Loan"
)
Text(0.5, 1.0, 'Count of customers of different Securities_Account, Separated by Personal_Loan')
# Let's see the above information as percentage stacked bars.
stacked_barplot(lmData, "Securities_Account", "Personal_Loan")
Personal_Loan 0 1 Securities_Account 1 88.37 11.63 All 90.30 9.70 0 90.52 9.48 --------------------------------------------------
# CD_Account vs Personal_Loan
plt.figure(figsize=(15, 7))
sns.countplot(x="CD_Account", data=lmData, palette="rainbow", hue="Personal_Loan")
plt.legend(loc="upper right")
plt.title("Count of customers of different CD_Account, Separated by Personal_Loan")
Text(0.5, 1.0, 'Count of customers of different CD_Account, Separated by Personal_Loan')
# Let's see the above information as percentage stacked bars.
# CD_Account holders accept loans at a much higher rate (see printed table).
stacked_barplot(lmData, "CD_Account", "Personal_Loan")
Personal_Loan 0 1 CD_Account 1 53.64 46.36 All 90.30 9.70 0 92.68 7.32 --------------------------------------------------
# Online vs Personal_Loan
plt.figure(figsize=(15, 7))
sns.countplot(x="Online", data=lmData, palette="rainbow", hue="Personal_Loan")
plt.legend(loc="upper right")
plt.title("Count of customers of different Online, Separated by Personal_Loan")
Text(0.5, 1.0, 'Count of customers of different Online, Separated by Personal_Loan')
# Let's see the above information as percentage stacked bars.
stacked_barplot(lmData, "Online", "Personal_Loan")
Personal_Loan 0 1 Online 1 90.15 9.85 All 90.30 9.70 0 90.52 9.48 --------------------------------------------------
# CreditCard vs Personal_Loan
plt.figure(figsize=(15, 7))
sns.countplot(x="CreditCard", data=lmData, palette="rainbow", hue="Personal_Loan")
plt.legend(loc="upper right")
plt.title("Count of customers of different CreditCard, Separated by Personal_Loan")
Text(0.5, 1.0, 'Count of customers of different CreditCard, Separated by Personal_Loan')
stacked_barplot(lmData, "Personal_Loan", "CreditCard")
CreditCard 0 1 Personal_Loan 1 70.21 29.79 All 70.59 29.41 0 70.64 29.36 --------------------------------------------------
# Let's now look at a few (three) attributes together.
# Mean of each log attribute across Age, split by Personal_Loan (ci=0 hides error bands).
plt.figure(figsize=(15, 7))
sns.lineplot(lmData["Age"], lmData["logIncome"], hue=lmData["Personal_Loan"], ci=0)
plt.legend(bbox_to_anchor=(1.00, 1))
plt.show()
plt.figure(figsize=(15, 7))
sns.lineplot(lmData["Age"], lmData["logCCAvg"], hue=lmData["Personal_Loan"], ci=0)
plt.legend(bbox_to_anchor=(1.00, 1))
plt.show()
plt.figure(figsize=(15, 7))
sns.lineplot(lmData["Age"], lmData["logMortgage"], hue=lmData["Personal_Loan"], ci=0)
plt.legend(bbox_to_anchor=(1.00, 1))
plt.show()
# Same analysis as earlier, but against ZIPCode instead of Age.
plt.figure(figsize=(15, 7))
sns.lineplot(lmData["ZIPCode"], lmData["logIncome"], hue=lmData["Personal_Loan"], ci=0)
plt.legend(bbox_to_anchor=(1.00, 1))
plt.show()
plt.figure(figsize=(15, 7))
sns.lineplot(lmData["ZIPCode"], lmData["logCCAvg"], hue=lmData["Personal_Loan"], ci=0)
plt.legend(bbox_to_anchor=(1.00, 1))
plt.show()
plt.figure(figsize=(15, 7))
sns.lineplot(
    lmData["ZIPCode"], lmData["logMortgage"], hue=lmData["Personal_Loan"], ci=0
)
plt.legend(bbox_to_anchor=(1.00, 1))
plt.show()
# We will enrich the dataset by adding a few attributes based on the zip code. For this we will use the uszipcodes package.
def set_zip_info(curSearchEngine, curLoc, lmData):
    """
    Look up the row's ZIPCode with the uszipcode search engine and copy the
    zip-code-level attributes into the matching columns of the dataframe.

    curSearchEngine: uszipcode SearchEngine (anything exposing by_zipcode)
    curLoc: row label of the record to enrich
    lmData: dataframe with a ZIPCode column and the target attribute columns
    """
    # BUG FIX: the original assigned to `curloc` (lowercase), an undefined
    # name, so every call raised NameError — and the bare `except: None`
    # silently swallowed it, leaving every enrichment column NaN. We now use
    # the actual `curLoc` parameter, write with label-based .at instead of
    # chained indexing (which pandas does not guarantee to write through),
    # and catch only AttributeError (by_zipcode returns None for an unknown
    # zip code, so attribute access fails), keeping the row as NaN then.
    try:
        z = curSearchEngine.by_zipcode(lmData["ZIPCode"][curLoc])
        for attr in (
            "zipcode_type",
            "major_city",
            "post_office_city",
            "common_city_list",
            "county",
            "state",
            "lat",
            "lng",
            "timezone",
            "radius_in_miles",
            "area_code_list",
            "population",
            "population_density",
            "land_area_in_sqmi",
            "water_area_in_sqmi",
            "housing_units",
            "occupied_housing_units",
            "median_home_value",
            "median_household_income",
            "bounds_west",
            "bounds_east",
            "bounds_north",
            "bounds_south",
        ):
            # NOTE(review): list-valued attributes (common_city_list,
            # area_code_list) require object-dtype columns — confirm the
            # columns are initialized accordingly.
            lmData.at[curLoc, attr] = getattr(z, attr)
    except AttributeError:
        # Unknown zip code: leave this row's enrichment columns untouched.
        pass
# First, let's add the attributes and set them to nan. The list of attributes
# that are added are based on uszipcodes documentation. Column creation order
# matches the original cell.
for new_col in (
    "zipcode_type",
    "major_city",
    "post_office_city",
    "common_city_list",
    "county",
    "state",
    "lat",
    "lng",
    "timezone",
    "radius_in_miles",
    "area_code_list",
    "population",
    "population_density",
    "land_area_in_sqmi",
    "water_area_in_sqmi",
    "housing_units",
    "occupied_housing_units",
    "median_home_value",
    "median_household_income",
    "bounds_west",
    "bounds_east",
    "bounds_north",
    "bounds_south",
):
    lmData[new_col] = np.nan
# Now, let's call above the function to fill in the newly added attributes.
# Print a message every 500 rows, so that we know we are making progress.
sr = SearchEngine()
row_count = lmData.shape[0]
# NOTE(review): the loop variable MUST stay named `curloc` — the original
# set_zip_info body reads it as a global (its parameter is spelled curLoc).
for curloc in range(row_count):
    set_zip_info(sr, curloc, lmData)
    if curloc % 500 == 0:
        print("Completed row #", curloc)
Completed row # 0 Completed row # 500 Completed row # 1000 Completed row # 1500 Completed row # 2000 Completed row # 2500 Completed row # 3000 Completed row # 3500 Completed row # 4000 Completed row # 4500
# Let's take a look at the new attributes. Scroll to the right to see them.
pd.options.display.max_columns = 50  # widen the display so all new columns show
lmData.head()
| Age | Experience | ZIPCode | Family | Education | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | logIncome | logCCAvg | logMortgage | zipcode_type | major_city | post_office_city | common_city_list | county | state | lat | lng | timezone | radius_in_miles | area_code_list | population | population_density | land_area_in_sqmi | water_area_in_sqmi | housing_units | occupied_housing_units | median_home_value | median_household_income | bounds_west | bounds_east | bounds_north | bounds_south | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 91107 | 4 | 1 | 0 | 1 | 0 | 0 | 0 | 3.891820 | 0.955511 | 0.0 | STANDARD | Pasadena | Pasadena, CA | [Pasadena] | Los Angeles County | CA | 34.16 | -118.08 | America/Los_Angeles | 9.000000 | 626 | 32940.0 | 4008.0 | 8.22 | 0.08 | 13763.0 | 13028.0 | 633100.0 | 80936.0 | -118.113219 | -118.065616 | 34.193413 | 34.126840 |
| 1 | 45 | 19 | 90089 | 3 | 1 | 0 | 1 | 0 | 0 | 0 | 3.526361 | 0.916291 | 0.0 | UNIQUE | Los Angeles | Los Angeles, CA | [Los Angeles] | Los Angeles County | CA | 34.02 | -118.29 | America/Los_Angeles | 0.454545 | 213 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -118.291538 | -118.203489 | 34.062396 | 34.017410 |
| 2 | 39 | 15 | 94720 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 2.397895 | 0.693147 | 0.0 | UNIQUE | Berkeley | Berkeley, CA | [Berkeley, Uc Berkeley] | Alameda County | CA | 37.87 | -122.25 | America/Los_Angeles | 0.738636 | 510 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -122.266367 | -122.244865 | 37.881363 | 37.865429 |
| 3 | 35 | 9 | 94112 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 4.605170 | 1.308333 | 0.0 | STANDARD | San Francisco | San Francisco, CA | [San Francisco] | San Francisco County | CA | 37.72 | -122.44 | America/Los_Angeles | 2.000000 | 510,415,650 | 79407.0 | 23606.0 | 3.36 | 0.00 | 23081.0 | 21975.0 | 602400.0 | 71625.0 | -122.468939 | -122.415459 | 37.736372 | 37.708131 |
| 4 | 35 | 8 | 91330 | 4 | 2 | 0 | 0 | 0 | 0 | 1 | 3.806662 | 0.693147 | 0.0 | UNIQUE | Northridge | Northridge, CA | [Northridge] | Los Angeles County | CA | 34.25 | -118.53 | America/Los_Angeles | 0.681818 | 213 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -118.531719 | -118.523213 | 34.257339 | 34.238950 |
# Looks good so far. But how many of them are blank? (uszipcode does not have
# every attribute for every zip code)
lmData.isnull().sum()
Age 0 Experience 0 ZIPCode 0 Family 0 Education 0 Personal_Loan 0 Securities_Account 0 CD_Account 0 Online 0 CreditCard 0 logIncome 0 logCCAvg 0 logMortgage 0 zipcode_type 84 major_city 84 post_office_city 633 common_city_list 84 county 84 state 84 lat 84 lng 84 timezone 84 radius_in_miles 633 area_code_list 84 population 1046 population_density 1046 land_area_in_sqmi 1046 water_area_in_sqmi 1046 housing_units 1046 occupied_housing_units 1046 median_home_value 1046 median_household_income 1046 bounds_west 633 bounds_east 633 bounds_north 633 bounds_south 633 dtype: int64
# Quite a few, unfortunately. Let's look at the distribution of values in some
# of these categorical attributes.
categorical_zip_cols = (
    "zipcode_type",
    "major_city",
    "post_office_city",
    "common_city_list",
    "county",
    "state",
    "timezone",
    "area_code_list",
)
for col in categorical_zip_cols:
    print("Column: ", col)
    print(lmData[col].value_counts())
    print("")
Column: zipcode_type
STANDARD 4005
UNIQUE 658
PO BOX 201
Name: zipcode_type, dtype: int64
Column: major_city
Los Angeles 366
San Diego 266
San Francisco 252
Berkeley 234
Sacramento 144
...
Stinson Beach 1
Sausalito 1
Sierra Madre 1
Tahoe City 1
Ladera Ranch 1
Name: major_city, Length: 244, dtype: int64
Column: post_office_city
Los Angeles, CA 355
Berkeley, CA 222
San Francisco, CA 212
San Diego, CA 211
Sacramento, CA 131
...
Stinson Beach, CA 1
Sierra Madre, CA 1
Ladera Ranch, CA 1
Sausalito, CA 1
Tahoe City, CA 1
Name: post_office_city, Length: 241, dtype: int64
Column: common_city_list
[San Diego] 266
[San Francisco] 252
[Los Angeles] 194
[Berkeley, Uc Berkeley] 163
[Sacramento] 144
...
[Ladera Ranch, Mission Viejo] 1
[Sausalito, Muir Beach] 1
[Stinson Beach] 1
[San Mateo, Foster City] 1
[Sierra Madre] 1
Name: common_city_list, Length: 287, dtype: int64
Column: county
Los Angeles County 1071
San Diego County 563
Santa Clara County 554
Alameda County 492
Orange County 325
San Francisco County 252
San Mateo County 201
Sacramento County 180
Santa Barbara County 150
Yolo County 129
Monterey County 126
Ventura County 111
San Bernardino County 99
Contra Costa County 83
Santa Cruz County 67
Riverside County 55
Kern County 52
Marin County 51
Solano County 33
San Luis Obispo County 32
Humboldt County 32
Sonoma County 28
Fresno County 26
Placer County 24
Butte County 19
Shasta County 18
El Dorado County 16
Stanislaus County 15
San Joaquin County 12
San Benito County 10
Mendocino County 8
Siskiyou County 7
Tuolumne County 6
Trinity County 4
Merced County 4
Imperial County 3
Napa County 3
Lake County 3
Name: county, dtype: int64
Column: state
CA 4864
Name: state, dtype: int64
Column: timezone
America/Los_Angeles 4864
Name: timezone, dtype: int64
Column: area_code_list
650 466
510 395
805 290
213 203
415 193
...
213,310,323 2
510.0 1
626.0 1
310,323,714,562 1
213.0 1
Name: area_code_list, Length: 88, dtype: int64
# Having seen the distribution of categorical attributes, let's see the boxplot
# and histogram of numerical attributes. histogram_boxplot is a helper defined
# earlier in the notebook; null rows are excluded per column.
numeric_zip_cols = (
    "lat",
    "lng",
    "radius_in_miles",
    "population",
    "population_density",
    "land_area_in_sqmi",
    "water_area_in_sqmi",
    "housing_units",
    "occupied_housing_units",
    "median_home_value",
    "median_household_income",
    "bounds_west",
    "bounds_east",
    "bounds_north",
    "bounds_south",
)
for col in numeric_zip_cols:
    histogram_boxplot(lmData[lmData[col].notnull()], col)
    print("")
# Let's do a count plot of Personal_Loan for each county (one of the new attributes).
plt.figure(figsize=(15, 7))
mychart = sns.countplot(x="county", data=lmData, palette="rainbow", hue="Personal_Loan")
mychart.set_xticklabels(mychart.get_xticklabels(), rotation=-70)
plt.legend(loc="upper right")
# Fixed title: it previously said "family sizes" (copy-paste from an earlier
# cell) although this chart is per county.
plt.title("Count of customers of different counties, Separated by Personal_Loan")
Text(0.5, 1.0, 'Count of customers of different family sizes, Separated by Personal_Loan')
stacked_barplot(lmData, "county", "Personal_Loan")
Personal_Loan 0 1 county Sonoma County 78.57 21.43 Shasta County 83.33 16.67 San Luis Obispo County 84.38 15.62 Contra Costa County 85.54 14.46 Kern County 86.54 13.46 Santa Clara County 87.18 12.82 Mendocino County 87.50 12.50 Santa Cruz County 88.06 11.94 Monterey County 88.10 11.90 Marin County 88.24 11.76 Riverside County 89.09 10.91 Butte County 89.47 10.53 San Diego County 89.52 10.48 Los Angeles County 89.73 10.27 Ventura County 90.09 9.91 All 90.23 9.77 Orange County 90.77 9.23 Solano County 90.91 9.09 Alameda County 91.06 8.94 Sacramento County 91.67 8.33 Placer County 91.67 8.33 San Joaquin County 91.67 8.33 Fresno County 92.31 7.69 San Francisco County 92.46 7.54 Santa Barbara County 92.67 7.33 Stanislaus County 93.33 6.67 Humboldt County 93.75 6.25 Yolo County 93.80 6.20 San Mateo County 94.03 5.97 San Bernardino County 96.97 3.03 El Dorado County 100.00 0.00 Tuolumne County 100.00 0.00 Trinity County 100.00 0.00 Imperial County 100.00 0.00 Siskiyou County 100.00 0.00 Lake County 100.00 0.00 Merced County 100.00 0.00 Napa County 100.00 0.00 San Benito County 100.00 0.00 --------------------------------------------------
# Fix the outliers (based on IQR)
numerical_col = ["median_household_income", "population", "population_density"]
# NOTE(review): the result is bound to `data`, but the following cells keep
# using lmData — this only works if treat_outliers_all (defined earlier, not
# in this view) mutates its argument in place; verify against its definition.
data = treat_outliers_all(lmData, numerical_col)
# Let's look at the data types (the uszipcode text columns come back as object dtype)
lmData.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4948 entries, 0 to 4999 Data columns (total 36 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 4948 non-null int64 1 Experience 4948 non-null int64 2 ZIPCode 4948 non-null int64 3 Family 4948 non-null int64 4 Education 4948 non-null int64 5 Personal_Loan 4948 non-null int64 6 Securities_Account 4948 non-null int64 7 CD_Account 4948 non-null int64 8 Online 4948 non-null int64 9 CreditCard 4948 non-null int64 10 logIncome 4948 non-null float64 11 logCCAvg 4948 non-null float64 12 logMortgage 4948 non-null float64 13 zipcode_type 4864 non-null object 14 major_city 4864 non-null object 15 post_office_city 4315 non-null object 16 common_city_list 4864 non-null object 17 county 4864 non-null object 18 state 4864 non-null object 19 lat 4864 non-null float64 20 lng 4864 non-null float64 21 timezone 4864 non-null object 22 radius_in_miles 4315 non-null float64 23 area_code_list 4864 non-null object 24 population 3902 non-null float64 25 population_density 3902 non-null float64 26 land_area_in_sqmi 3902 non-null float64 27 water_area_in_sqmi 3902 non-null float64 28 housing_units 3902 non-null float64 29 occupied_housing_units 3902 non-null float64 30 median_home_value 3902 non-null float64 31 median_household_income 3902 non-null float64 32 bounds_west 4315 non-null float64 33 bounds_east 4315 non-null float64 34 bounds_north 4315 non-null float64 35 bounds_south 4315 non-null float64 dtypes: float64(18), int64(10), object(8) memory usage: 1.6+ MB
# Among the new attributes, some have very high cardinality (or are constant,
# like state/timezone) and we will drop them.
high_cardinality_cols = [
    "zipcode_type",
    "major_city",
    "post_office_city",
    "common_city_list",
    "county",
    "state",
    "timezone",
    "area_code_list",
]
lmData = lmData.drop(columns=high_cardinality_cols)
# Now let's look at the data one more time (only numeric columns remain).
lmData.head()
| Age | Experience | ZIPCode | Family | Education | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | logIncome | logCCAvg | logMortgage | lat | lng | radius_in_miles | population | population_density | land_area_in_sqmi | water_area_in_sqmi | housing_units | occupied_housing_units | median_home_value | median_household_income | bounds_west | bounds_east | bounds_north | bounds_south | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 91107 | 4 | 1 | 0 | 1 | 0 | 0 | 0 | 3.891820 | 0.955511 | 0.0 | 34.16 | -118.08 | 9.000000 | 32940.0 | 4008.0 | 8.22 | 0.08 | 13763.0 | 13028.0 | 633100.0 | 80936.0 | -118.113219 | -118.065616 | 34.193413 | 34.126840 |
| 1 | 45 | 19 | 90089 | 3 | 1 | 0 | 1 | 0 | 0 | 0 | 3.526361 | 0.916291 | 0.0 | 34.02 | -118.29 | 0.454545 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -118.291538 | -118.203489 | 34.062396 | 34.017410 |
| 2 | 39 | 15 | 94720 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 2.397895 | 0.693147 | 0.0 | 37.87 | -122.25 | 0.738636 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -122.266367 | -122.244865 | 37.881363 | 37.865429 |
| 3 | 35 | 9 | 94112 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 4.605170 | 1.308333 | 0.0 | 37.72 | -122.44 | 2.000000 | 79407.0 | 16030.5 | 3.36 | 0.00 | 23081.0 | 21975.0 | 602400.0 | 71625.0 | -122.468939 | -122.415459 | 37.736372 | 37.708131 |
| 4 | 35 | 8 | 91330 | 4 | 2 | 0 | 0 | 0 | 0 | 1 | 3.806662 | 0.693147 | 0.0 | 34.25 | -118.53 | 0.681818 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -118.531719 | -118.523213 | 34.257339 | 34.238950 |
# Look at the data types after dropping the object columns
lmData.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4948 entries, 0 to 4999 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 4948 non-null int64 1 Experience 4948 non-null int64 2 ZIPCode 4948 non-null int64 3 Family 4948 non-null int64 4 Education 4948 non-null int64 5 Personal_Loan 4948 non-null int64 6 Securities_Account 4948 non-null int64 7 CD_Account 4948 non-null int64 8 Online 4948 non-null int64 9 CreditCard 4948 non-null int64 10 logIncome 4948 non-null float64 11 logCCAvg 4948 non-null float64 12 logMortgage 4948 non-null float64 13 lat 4864 non-null float64 14 lng 4864 non-null float64 15 radius_in_miles 4315 non-null float64 16 population 3902 non-null float64 17 population_density 3902 non-null float64 18 land_area_in_sqmi 3902 non-null float64 19 water_area_in_sqmi 3902 non-null float64 20 housing_units 3902 non-null float64 21 occupied_housing_units 3902 non-null float64 22 median_home_value 3902 non-null float64 23 median_household_income 3902 non-null float64 24 bounds_west 4315 non-null float64 25 bounds_east 4315 non-null float64 26 bounds_north 4315 non-null float64 27 bounds_south 4315 non-null float64 dtypes: float64(18), int64(10) memory usage: 1.3 MB
# Let's see the distribution of the target attribute (class balance).
target_is_loan = lmData["Personal_Loan"] == 1
n_true = int(target_is_loan.sum())
n_false = int((~target_is_loan).sum())
total = n_true + n_false
print(
    "Number of true cases: {0} ({1:2.2f}%)".format(n_true, n_true / total * 100)
)
print(
    "Number of false cases: {0} ({1:2.2f}%)".format(n_false, n_false / total * 100)
)
Number of true cases: 480 (9.70%) Number of false cases: 4468 (90.30%)
# Set up X and Y and then split up into train and test datasets.
# NOTE(review): the split is not stratified on Personal_Loan; the class
# ratios still came out close (see the check two cells below).
X = lmData.drop("Personal_Loan", axis=1)  # Predictor feature columns
Y = lmData["Personal_Loan"]  # Predicted class (1=True, 0=False) (1 X m)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
# 1 is just any random seed number
x_train.head()
| Age | Experience | ZIPCode | Family | Education | Securities_Account | CD_Account | Online | CreditCard | logIncome | logCCAvg | logMortgage | lat | lng | radius_in_miles | population | population_density | land_area_in_sqmi | water_area_in_sqmi | housing_units | occupied_housing_units | median_home_value | median_household_income | bounds_west | bounds_east | bounds_north | bounds_south | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3395 | 41 | 16 | 94061 | 2 | 2 | 0 | 0 | 0 | 0 | 3.555348 | 0.875469 | 4.912655 | 37.46 | -122.24 | 2.0 | 36245.0 | 9391.0 | 3.86 | 0.00 | 14010.0 | 13436.0 | 777900.0 | 79833.0 | -122.269864 | -122.211288 | 37.482088 | 37.443557 |
| 370 | 36 | 12 | 92101 | 4 | 1 | 0 | 0 | 0 | 0 | 3.218876 | 0.693147 | 4.997212 | 32.71 | -117.16 | 2.0 | 37095.0 | 7867.0 | 4.72 | 0.21 | 25169.0 | 20599.0 | 448700.0 | 52550.0 | -117.216513 | -117.142667 | 32.742536 | 32.696912 |
| 2337 | 43 | 16 | 95054 | 1 | 2 | 0 | 0 | 0 | 1 | 5.303305 | 2.397895 | 0.000000 | 37.39 | -121.96 | 2.0 | 23364.0 | 3730.0 | 6.26 | 0.00 | 8891.0 | 8447.0 | 605900.0 | 110099.0 | -121.987117 | -121.935639 | 37.418922 | 37.373520 |
| 3341 | 35 | 9 | 91125 | 2 | 1 | 1 | 0 | 1 | 0 | 3.496508 | 0.262364 | 0.000000 | 34.13 | -118.12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3602 | 47 | 21 | 95841 | 4 | 1 | 0 | 0 | 1 | 1 | 3.737670 | 0.095310 | 0.000000 | 38.66 | -121.35 | 2.0 | 19448.0 | 4771.0 | 4.08 | 0.00 | 8804.0 | 7957.0 | 171200.0 | 36967.0 | -121.382008 | -121.326387 | 38.691852 | 38.640407 |
# Check the split: roughly 70/30 as requested.
train_pct = len(x_train) / len(lmData.index) * 100
test_pct = len(x_test) / len(lmData.index) * 100
print("{0:0.2f}% data is in training set".format(train_pct))
print("{0:0.2f}% data is in test set".format(test_pct))
69.99% data is in training set 30.01% data is in test set
# Check the target attribute distribution in train and test sets.
# One loop covers the full data and both splits; output is identical to the
# original cell (True line, False line, blank line per split).
for label, series in (
    ("Original", lmData["Personal_Loan"]),
    ("Training", y_train),
    ("Test", y_test),
):
    n_total = len(series)
    n_pos = int((series == 1).sum())
    n_neg = int((series == 0).sum())
    print(
        "{0} Personal_Loan True Values : {1} ({2:0.2f}%)".format(
            label, n_pos, n_pos / n_total * 100
        )
    )
    print(
        "{0} Personal_Loan False Values : {1} ({2:0.2f}%)".format(
            label, n_neg, n_neg / n_total * 100
        )
    )
    print("")
Original Personal_Loan True Values : 480 (9.70%) Original Personal_Loan False Values : 4468 (90.30%) Training Personal_Loan True Values : 335 (9.67%) Training Personal_Loan False Values : 3128 (90.33%) Test Personal_Loan True Values : 145 (9.76%) Test Personal_Loan False Values : 1340 (90.24%)
# Let's check the missing cases (only the zip-enrichment columns have NaNs).
lmData.isnull().sum()
Age 0 Experience 0 ZIPCode 0 Family 0 Education 0 Personal_Loan 0 Securities_Account 0 CD_Account 0 Online 0 CreditCard 0 logIncome 0 logCCAvg 0 logMortgage 0 lat 84 lng 84 radius_in_miles 633 population 1046 population_density 1046 land_area_in_sqmi 1046 water_area_in_sqmi 1046 housing_units 1046 occupied_housing_units 1046 median_home_value 1046 median_household_income 1046 bounds_west 633 bounds_east 633 bounds_north 633 bounds_south 633 dtype: int64
# We have missing cases. Hence, we will get the imputer to run on the relevant columns.
# The imputer is fit on the training split only and reused on the test split,
# so no information leaks from test into train.
rep_0 = SimpleImputer(missing_values=np.nan, strategy="mean")
cols = [
    "lat",
    "lng",
    "radius_in_miles",
    "population",
    "population_density",
    "land_area_in_sqmi",
    "water_area_in_sqmi",
    "housing_units",
    "occupied_housing_units",
    "median_home_value",
    "median_household_income",
    "bounds_west",
    "bounds_east",
    "bounds_north",
    "bounds_south",
]
x_train[cols] = rep_0.fit_transform(x_train[cols])  # learn column means on train
x_test[cols] = rep_0.transform(x_test[cols])  # apply the train means to test
x_test.head(10)
| Age | Experience | ZIPCode | Family | Education | Securities_Account | CD_Account | Online | CreditCard | logIncome | logCCAvg | logMortgage | lat | lng | radius_in_miles | population | population_density | land_area_in_sqmi | water_area_in_sqmi | housing_units | occupied_housing_units | median_home_value | median_household_income | bounds_west | bounds_east | bounds_north | bounds_south | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2376 | 58 | 33 | 91768 | 3 | 1 | 0 | 0 | 0 | 0 | 3.135494 | 0.182322 | 0.000000 | 34.06 | -117.79 | 3.000000 | 34537.000000 | 3993.000000 | 8.650000 | 0.010000 | 8381.000000 | 7885.000000 | 2.606000e+05 | 47228.000000 | -117.834298 | -117.750183 | 34.095848 | 34.030908 |
| 3641 | 59 | 35 | 94402 | 4 | 3 | 0 | 0 | 0 | 0 | 4.304065 | 1.193922 | 0.000000 | 37.52 | -122.35 | 3.000000 | 23981.000000 | 4934.000000 | 4.860000 | 0.000000 | 9804.000000 | 9422.000000 | 9.892000e+05 | 107124.000000 | -122.361134 | -122.296325 | 37.573163 | 37.507196 |
| 3774 | 51 | 26 | 92521 | 4 | 3 | 0 | 0 | 1 | 0 | 3.951244 | 1.029619 | 0.000000 | 33.97 | -117.34 | 3.979354 | 34177.783126 | 5382.403367 | 21.521881 | 0.578206 | 13466.515007 | 12596.465227 | 6.017051e+05 | 77967.815886 | -120.184149 | -120.092057 | 35.923754 | 35.850462 |
| 4143 | 55 | 31 | 94720 | 2 | 1 | 0 | 0 | 1 | 0 | 2.995732 | 0.262364 | 0.000000 | 37.87 | -122.25 | 0.738636 | 34177.783126 | 5382.403367 | 21.521881 | 0.578206 | 13466.515007 | 12596.465227 | 6.017051e+05 | 77967.815886 | -122.266367 | -122.244865 | 37.881363 | 37.865429 |
| 250 | 30 | 6 | 94305 | 3 | 2 | 0 | 0 | 0 | 0 | 3.367296 | 0.693147 | 4.770685 | 37.41 | -122.17 | 2.000000 | 13862.000000 | 2703.000000 | 5.130000 | 0.050000 | 4020.000000 | 3933.000000 | 1.000001e+06 | 64697.000000 | -122.194331 | -122.149240 | 37.443630 | 37.392201 |
| 2949 | 37 | 11 | 95054 | 3 | 2 | 0 | 0 | 1 | 0 | 2.944439 | 0.182322 | 0.000000 | 37.39 | -121.96 | 2.000000 | 23364.000000 | 3730.000000 | 6.260000 | 0.000000 | 8891.000000 | 8447.000000 | 6.059000e+05 | 110099.000000 | -121.987117 | -121.935639 | 37.418922 | 37.373520 |
| 3735 | 40 | 14 | 91103 | 1 | 1 | 0 | 0 | 1 | 0 | 4.356709 | 1.824549 | 0.000000 | 34.17 | -118.17 | 2.000000 | 27480.000000 | 5122.000000 | 5.370000 | 0.060000 | 9076.000000 | 8492.000000 | 5.200000e+05 | 59682.000000 | -118.190970 | -118.145419 | 34.201583 | 34.145760 |
| 1540 | 34 | 8 | 91320 | 4 | 1 | 1 | 1 | 1 | 1 | 2.397895 | 0.262364 | 0.000000 | 34.17 | -118.95 | 4.000000 | 44274.000000 | 2426.000000 | 18.250000 | 0.000000 | 15438.000000 | 15076.000000 | 5.766000e+05 | 104665.000000 | -119.003810 | -118.885558 | 34.215885 | 34.135933 |
| 2372 | 34 | 10 | 93943 | 3 | 1 | 0 | 0 | 1 | 1 | 3.806662 | 1.335001 | 5.036953 | 36.60 | -121.87 | 0.284091 | 34177.783126 | 5382.403367 | 21.521881 | 0.578206 | 13466.515007 | 12596.465227 | 6.017051e+05 | 77967.815886 | -121.878880 | -121.867327 | 36.600058 | 36.593689 |
| 4500 | 50 | 26 | 94305 | 4 | 2 | 0 | 0 | 0 | 1 | 3.178054 | 0.405465 | 0.000000 | 37.41 | -122.17 | 2.000000 | 13862.000000 | 2703.000000 | 5.130000 | 0.050000 | 4020.000000 | 3933.000000 | 1.000001e+06 | 64697.000000 | -122.194331 | -122.149240 | 37.443630 | 37.392201 |
# Fit the LR model on train. As suggested in the project FAQ, we will use the newton-cg solver.
model = LogisticRegression(solver="newton-cg", random_state=1)
model2 = model.fit(x_train, y_train)  # fit() returns self, so model2 IS model
# predict on test
y_predict = model.predict(x_test)  # hard 0/1 predictions at the default 0.5 cut-off
coef_df = pd.DataFrame(model.coef_)  # raw (unlabelled) coefficients
coef_df["intercept"] = model.intercept_
print(coef_df)
0 1 2 3 4 5 6 \
0 -0.095338 0.102075 -0.000118 0.562371 1.417641 -0.208146 2.339185
7 8 9 10 11 12 13 \
0 -0.446762 -0.706139 4.931946 0.939848 0.066047 0.015567 0.036175
14 15 16 17 18 19 20 \
0 -0.065738 -0.000008 -0.000021 0.00355 -0.027434 0.000046 -0.000037
21 22 23 24 25 26 intercept
0 5.345755e-07 -0.000004 0.078955 0.125547 0.15386 0.1785 -0.029781
# Let us check the coefficients and intercept of the model, this time with
# the feature names attached as the index.
coef_values = np.concatenate([model.coef_.ravel(), model.intercept_])
coef_labels = x_train.columns.tolist() + ["Intercept"]
coef_df = pd.DataFrame(coef_values, index=coef_labels, columns=["Coefficients"])
coef_df.T
| Age | Experience | ZIPCode | Family | Education | Securities_Account | CD_Account | Online | CreditCard | logIncome | logCCAvg | logMortgage | lat | lng | radius_in_miles | population | population_density | land_area_in_sqmi | water_area_in_sqmi | housing_units | occupied_housing_units | median_home_value | median_household_income | bounds_west | bounds_east | bounds_north | bounds_south | Intercept | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Coefficients | -0.095338 | 0.102075 | -0.000118 | 0.562371 | 1.417641 | -0.208146 | 2.339185 | -0.446762 | -0.706139 | 4.931946 | 0.939848 | 0.066047 | 0.015567 | 0.036175 | -0.065738 | -0.000008 | -0.000021 | 0.00355 | -0.027434 | 0.000046 | -0.000037 | 5.345755e-07 | -0.000004 | 0.078955 | 0.125547 | 0.15386 | 0.1785 | -0.029781 |
# Let's print the coefficients in decreasing order of magnitude (ignoring the signs)
abs(coef_df).sort_values(by="Coefficients", ascending=False).T
| logIncome | CD_Account | Education | logCCAvg | CreditCard | Family | Online | Securities_Account | bounds_south | bounds_north | bounds_east | Experience | Age | bounds_west | logMortgage | radius_in_miles | lng | Intercept | water_area_in_sqmi | lat | land_area_in_sqmi | ZIPCode | housing_units | occupied_housing_units | population_density | population | median_household_income | median_home_value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Coefficients | 4.931946 | 2.339185 | 1.417641 | 0.939848 | 0.706139 | 0.562371 | 0.446762 | 0.208146 | 0.1785 | 0.15386 | 0.125547 | 0.102075 | 0.095338 | 0.078955 | 0.066047 | 0.065738 | 0.036175 | 0.029781 | 0.027434 | 0.015567 | 0.00355 | 0.000118 | 0.000046 | 0.000037 | 0.000021 | 0.000008 | 0.000004 | 5.345755e-07 |
# The functions below display the model information and are taken from the code shared in the weekly sessions.
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn_with_threshold(
    model, predictors, target, threshold=0.5
):
    """
    Function to compute different metrics, based on the threshold specified, to check classification model performance

    model: fitted classifier exposing predict_proba
    predictors: independent variables
    target: dependent variable (0/1)
    threshold: probability cut-off for classifying an observation as class 1

    Returns a one-row DataFrame with Accuracy, Recall, Precision and F1.
    """
    # probability of the positive class for every observation
    pred_prob = model.predict_proba(predictors)[:, 1]
    # classify as 1 when the probability exceeds the threshold; cast the
    # boolean mask to int explicitly (the original np.round() on the boolean
    # array did the same thing less directly)
    pred = (pred_prob > threshold).astype(int)
    acc = accuracy_score(target, pred)  # to compute Accuracy
    recall = recall_score(target, pred)  # to compute Recall
    precision = precision_score(target, pred)  # to compute Precision
    f1 = f1_score(target, pred)  # to compute F1-score
    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {
            "Accuracy": acc,
            "Recall": recall,
            "Precision": precision,
            "F1": f1,
        },
        index=[0],
    )
    return df_perf
# defining a function to plot the confusion_matrix of a classification model built using sklearn
def confusion_matrix_sklearn_with_threshold(model, predictors, target, threshold=0.5):
    """
    To plot the confusion_matrix, based on the threshold specified, with percentages

    model: classifier
    predictors: independent variables
    target: dependent variable
    threshold: threshold for classifying the observation as class 1
    """
    # probability of class 1, thresholded into hard predictions
    prob_class1 = model.predict_proba(predictors)[:, 1]
    y_pred = np.round(prob_class1 > threshold)
    cm = confusion_matrix(target, y_pred)  # from sklearn
    # annotate every cell with "count \n percent of all observations"
    total = cm.flatten().sum()
    cell_text = []
    for item in cm.flatten():
        cell_text.append(
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)]
        )
    labels = np.asarray(cell_text).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="", cmap="PuBuGn")
    plt.ylabel("Actual")
    plt.xlabel("Predicted")
# Display the confusion matrix (train), default 0.5 threshold
confusion_matrix_sklearn_with_threshold(model, x_train, y_train)
# Display the model performance information (train)
log_reg_model_train_perf = model_performance_classification_sklearn_with_threshold(
    model, x_train, y_train
)
print("Training performance:")
log_reg_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.954952 | 0.626866 | 0.871369 | 0.729167 |
# Display the confusion matrix (test), default 0.5 threshold
confusion_matrix_sklearn_with_threshold(model, x_test, y_test)
# Display the model performance information (test)
log_reg_model_test_perf = model_performance_classification_sklearn_with_threshold(
    model, x_test, y_test
)
print("Test set performance:")
log_reg_model_test_perf
Test set performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.956902 | 0.634483 | 0.893204 | 0.741935 |
# Let's plot the ROC curve (training dataset). AUC close to 1 means the model
# separates the classes well across all thresholds.
# roc_curve returns fpr : array, shape = [>2]
#    Increasing false positive rates such that element i is the false
#    positive rate of predictions with score >= thresholds[i].
# tpr : array, shape = [>2]
#    Increasing true positive rates such that element i is the true
#    positive rate of predictions with score >= thresholds[i].
# thresholds : array, shape = [n_thresholds]
#    Decreasing thresholds on the decision function used to compute
#    fpr and tpr.
logit_roc_auc_train = roc_auc_score(y_train, model.predict_proba(x_train)[:, 1])
fpr, tpr, thresholds = roc_curve(y_train, model.predict_proba(x_train)[:, 1])
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label="Logistic Regression (area = %0.2f)" % logit_roc_auc_train)
plt.plot([0, 1], [0, 1], "r--")  # diagonal = random classifier baseline
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic Curve")
plt.legend(loc="lower right")
plt.show()
# Let's plot the ROC curve (test dataset). predict_proba is computed once
# and reused for the AUC and the curve.
test_probs = model.predict_proba(x_test)[:, 1]
logit_roc_auc_test = roc_auc_score(y_test, test_probs)
fpr, tpr, thresholds = roc_curve(y_test, test_probs)
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label="Logistic Regression (area = %0.2f)" % logit_roc_auc_test)
plt.plot([0, 1], [0, 1], "r--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
# Optimal threshold as per AUC-ROC curve
# The optimal cut off would be where tpr is high and fpr is low
train_probs = model.predict_proba(x_train)[:, 1]
fpr, tpr, thresholds = roc_curve(y_train, train_probs)
# Youden's J statistic: pick the threshold maximising tpr - fpr
optimal_threshold_auc_roc = thresholds[np.argmax(tpr - fpr)]
print(optimal_threshold_auc_roc)
0.11010231013109878
# creating confusion matrix (train) for the optimal threshold
confusion_matrix_sklearn_with_threshold(
    model, x_train, y_train, threshold=optimal_threshold_auc_roc
)
# checking model performance for this model at the ROC-optimal threshold
log_reg_model_train_perf_threshold_auc_roc = (
    model_performance_classification_sklearn_with_threshold(
        model, x_train, y_train, threshold=optimal_threshold_auc_roc
    )
)
print("Training performance:")
log_reg_model_train_perf_threshold_auc_roc
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.885937 | 0.922388 | 0.455752 | 0.610069 |
# creating confusion matrix (test) for the optimal ROC threshold
confusion_matrix_sklearn_with_threshold(
    model, x_test, y_test, threshold=optimal_threshold_auc_roc
)
# checking model performance for this model at the ROC-optimal threshold
log_reg_model_test_perf_threshold_auc_roc = (
    model_performance_classification_sklearn_with_threshold(
        model, x_test, y_test, threshold=optimal_threshold_auc_roc
    )
)
print("Test set performance:")
log_reg_model_test_perf_threshold_auc_roc
Test set performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.896296 | 0.924138 | 0.483755 | 0.635071 |
# Precision/recall trade-off across thresholds, on the training data.
y_scores = model.predict_proba(x_train)[:, 1]  # class-1 probabilities
prec, rec, tre = precision_recall_curve(
    y_train,
    y_scores,
)
## precision_recall_curve returns:
# precision : array,
#    Precision values such that element i is the precision of
#    predictions with score >= thresholds[i] and the last element is 1.
# recall : array,
#    Decreasing recall values such that element i is the recall of
#    predictions with score >= thresholds[i] and the last element is 0.
# thresholds : array, shape = [n_thresholds <= len(np.unique(probas_pred))]
#    Increasing thresholds on the decision function used to compute
#    precision and recall.
# Function code taken from the notebook shared during the weekly sessions
def plot_prec_recall_vs_tresh(precisions, recalls, thresholds):
plt.plot(thresholds, precisions[:-1], "b--", label="precision")
plt.plot(thresholds, recalls[:-1], "g--", label="recall")
plt.xlabel("Threshold")
plt.legend(loc="upper left")
plt.ylim([0, 1])
# Render the precision/recall-vs-threshold trade-off for the training scores.
plt.figure(figsize=(10, 7))
plot_prec_recall_vs_tresh(prec, rec, tre)
plt.show()
# Threshold read off the plot above: the crossing point where precision and
# recall are both high.
optimal_threshold_curve = (
    0.35  # From above, the crossing point, where precision and recall both are high.
)
# Confusion matrix (train) at the curve-derived threshold.
confusion_matrix_sklearn_with_threshold(
    model, x_train, y_train, threshold=optimal_threshold_curve
)
# Training metrics at the curve-derived threshold.
log_reg_model_train_perf_threshold_curve = (
    model_performance_classification_sklearn_with_threshold(
        model, x_train, y_train, threshold=optimal_threshold_curve
    )
)
print("Training performance:")
log_reg_model_train_perf_threshold_curve
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.949177 | 0.728358 | 0.741641 | 0.73494 |
# Confusion matrix (test) at the curve-derived threshold.
confusion_matrix_sklearn_with_threshold(
    model, x_test, y_test, threshold=optimal_threshold_curve
)
# Test metrics at the curve-derived threshold.
log_reg_model_test_perf_threshold_curve = (
    model_performance_classification_sklearn_with_threshold(
        model, x_test, y_test, threshold=optimal_threshold_curve
    )
)
print("Test set performance:")
log_reg_model_test_perf_threshold_curve
Test set performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.952189 | 0.717241 | 0.776119 | 0.74552 |
# So far we have considered ALL the features.
# Now we will optimize on the number of features.
# Fit the model on train
model = LogisticRegression(solver="newton-cg", n_jobs=-1, random_state=1, max_iter=100)
# First run sequential forward selection across every feature-count
# (k_features = total column count) to see how CV F1 evolves as features
# are added.
# Scoring is the F1 measure, which balances precision and recall.
# verbose=0 suppresses extra output.
sfs = SFS(
    model,
    k_features=x_train.shape[1],
    forward=True,
    floating=False,
    scoring="f1",
    verbose=0,
    cv=3,
    n_jobs=-1,
)
sfs = sfs.fit(x_train, y_train)
# Plot CV F1 (with a std-dev band) versus the number of selected features.
fig1 = plot_sfs(sfs.get_metric_dict(), kind="std_dev", figsize=(12, 5))
plt.ylim([0.1, 1])
plt.title("Sequential Forward Selection (w. StdDev)")
plt.xticks(rotation=90)
plt.show()
# We now limit the search space of models to those with 19 features.
sfs1 = SFS(
    model,
    k_features=19,
    forward=True,
    floating=False,
    scoring="f1",
    verbose=0,
    cv=3,
    n_jobs=-1,
)
sfs1 = sfs1.fit(x_train, y_train)
fig1 = plot_sfs(sfs1.get_metric_dict(), kind="std_dev", figsize=(10, 5))
plt.ylim([0.1, 1])
plt.title("Sequential Forward Selection (w. StdDev)")
plt.grid()
plt.show()
# Column indices of the 19 features chosen for the best model.
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)
[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 16, 17, 18, 19, 20, 23, 25, 26]
# The above were the index. Let's print the column names.
x_train.columns[feat_cols]
Index(['Experience', 'ZIPCode', 'Family', 'Education', 'Securities_Account',
'CD_Account', 'Online', 'CreditCard', 'logIncome', 'logMortgage', 'lat',
'population_density', 'land_area_in_sqmi', 'water_area_in_sqmi',
'housing_units', 'occupied_housing_units', 'bounds_west',
'bounds_north', 'bounds_south'],
dtype='object')
# Now we will use these features to build the model.
x_train_final = x_train[x_train.columns[feat_cols]]
# Creating new x_test with the same variables that we selected for x_train
x_test_final = x_test[x_train_final.columns]
# Fitting an unpenalized logistic regression on the selected features.
# NOTE(review): penalty="none" is deprecated in newer scikit-learn releases
# (penalty=None is the replacement) — confirm against the pinned version.
logreg = LogisticRegression(
    solver="newton-cg", penalty="none", verbose=True, n_jobs=-1, random_state=0
)
# Several optimizers are available; here we use 'newton-cg'.
logreg.fit(x_train_final, y_train)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 out of 1 | elapsed: 0.3s finished
LogisticRegression(n_jobs=-1, penalty='none', random_state=0,
solver='newton-cg', verbose=True)
# Display the confusion matrix for the optimal model (train).
confusion_matrix_sklearn_with_threshold(logreg, x_train_final, y_train)
# Display the performance for the optimal model (train).
log_reg_model_train_perf_SFS = model_performance_classification_sklearn_with_threshold(
logreg, x_train_final, y_train
)
print("Training performance:")
log_reg_model_train_perf_SFS
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.956974 | 0.671642 | 0.852273 | 0.751252 |
# Display the confusion matrix for the optimal model (test).
confusion_matrix_sklearn_with_threshold(logreg, x_test_final, y_test)
# Display the performance for the optimal model (test).
log_reg_model_test_perf_SFS = model_performance_classification_sklearn_with_threshold(
logreg, x_test_final, y_test
)
print("Test set performance:")
log_reg_model_test_perf_SFS
Test set performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.958249 | 0.689655 | 0.854701 | 0.763359 |
# Let's look at the performance of the first model we obtained, before any optimization.
log_reg_model_train_perf
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.954952 | 0.626866 | 0.871369 | 0.729167 |
# Training-performance comparison of all logistic-regression variants so far.
# Each perf DataFrame is transposed so the metrics become rows and each
# model variant becomes one column.
models_train_comp_df = pd.concat(
    [
        log_reg_model_train_perf.T,
        log_reg_model_train_perf_threshold_auc_roc.T,
        log_reg_model_train_perf_threshold_curve.T,
        log_reg_model_train_perf_SFS.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Logistic Regression sklearn",
    "Logistic Regression-0.11 Threshold",
    "Logistic Regression-0.35 Threshold",
    "Logistic Regression - SFS",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Logistic Regression sklearn | Logistic Regression-0.11 Threshold | Logistic Regression-0.35 Threshold | Logistic Regression - SFS | |
|---|---|---|---|---|
| Accuracy | 0.954952 | 0.885937 | 0.949177 | 0.956974 |
| Recall | 0.626866 | 0.922388 | 0.728358 | 0.671642 |
| Precision | 0.871369 | 0.455752 | 0.741641 | 0.852273 |
| F1 | 0.729167 | 0.610069 | 0.734940 | 0.751252 |
# Test-performance comparison for the same four model variants,
# built exactly like the training comparison above.
models_test_comp_df = pd.concat(
    [
        log_reg_model_test_perf.T,
        log_reg_model_test_perf_threshold_auc_roc.T,
        log_reg_model_test_perf_threshold_curve.T,
        log_reg_model_test_perf_SFS.T,
    ],
    axis=1,
)
models_test_comp_df.columns = [
    "Logistic Regression sklearn",
    "Logistic Regression-0.11 Threshold",
    "Logistic Regression-0.35 Threshold",
    "Logistic Regression - SFS",
]
print("Test set performance comparison:")
models_test_comp_df
Test set performance comparison:
| Logistic Regression sklearn | Logistic Regression-0.11 Threshold | Logistic Regression-0.35 Threshold | Logistic Regression - SFS | |
|---|---|---|---|---|
| Accuracy | 0.956902 | 0.896296 | 0.952189 | 0.958249 |
| Recall | 0.634483 | 0.924138 | 0.717241 | 0.689655 |
| Precision | 0.893204 | 0.483755 | 0.776119 | 0.854701 |
| F1 | 0.741935 | 0.635071 | 0.745520 | 0.763359 |
lmData.head()
| Age | Experience | ZIPCode | Family | Education | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | logIncome | logCCAvg | logMortgage | lat | lng | radius_in_miles | population | population_density | land_area_in_sqmi | water_area_in_sqmi | housing_units | occupied_housing_units | median_home_value | median_household_income | bounds_west | bounds_east | bounds_north | bounds_south | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 91107 | 4 | 1 | 0 | 1 | 0 | 0 | 0 | 3.891820 | 0.955511 | 0.0 | 34.16 | -118.08 | 9.000000 | 32940.0 | 4008.0 | 8.22 | 0.08 | 13763.0 | 13028.0 | 633100.0 | 80936.0 | -118.113219 | -118.065616 | 34.193413 | 34.126840 |
| 1 | 45 | 19 | 90089 | 3 | 1 | 0 | 1 | 0 | 0 | 0 | 3.526361 | 0.916291 | 0.0 | 34.02 | -118.29 | 0.454545 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -118.291538 | -118.203489 | 34.062396 | 34.017410 |
| 2 | 39 | 15 | 94720 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 2.397895 | 0.693147 | 0.0 | 37.87 | -122.25 | 0.738636 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -122.266367 | -122.244865 | 37.881363 | 37.865429 |
| 3 | 35 | 9 | 94112 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 4.605170 | 1.308333 | 0.0 | 37.72 | -122.44 | 2.000000 | 79407.0 | 16030.5 | 3.36 | 0.00 | 23081.0 | 21975.0 | 602400.0 | 71625.0 | -122.468939 | -122.415459 | 37.736372 | 37.708131 |
| 4 | 35 | 8 | 91330 | 4 | 2 | 0 | 0 | 0 | 0 | 1 | 3.806662 | 0.693147 | 0.0 | 34.25 | -118.53 | 0.681818 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -118.531719 | -118.523213 | 34.257339 | 34.238950 |
# Build the initial (fully grown) decision tree.
# class_weight penalizes misclassification of the rare positive class (1)
# more heavily than the majority class (0).
model = DecisionTreeClassifier(
    criterion="gini", class_weight={0: 0.15, 1: 0.85}, random_state=1
)
model.fit(x_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, random_state=1)
## Function to calculate recall score
# These codes are taken from the notebooks shared in the weekly sessions.
def get_recall_score(model, predictors, target):
    """
    Return the recall of `model` on the given data.

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: dependent variable
    """
    return recall_score(target, model.predict(predictors))
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix as a heatmap, annotating each cell with
    its raw count and its share of all predictions.

    model: fitted classifier
    predictors: independent variables
    target: dependent variable
    """
    cm = confusion_matrix(target, model.predict(predictors))
    counts = cm.flatten()
    total = counts.sum()  # hoisted: the original recomputed this per cell
    annot = np.asarray(
        [[f"{c:0.0f}\n{c / total:.2%}"] for c in counts]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=annot, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
confusion_matrix_sklearn(model, x_train, y_train)
# Display the recall for training data
decision_tree_perf_train = get_recall_score(model, x_train, y_train)
print("Recall Score:", decision_tree_perf_train)
Recall Score: 1.0
# Display the confusion matrix for test data
confusion_matrix_sklearn(model, x_test, y_test)
# Display the recall for test data
decision_tree_perf_test = get_recall_score(model, x_test, y_test)
print("Recall Score:", decision_tree_perf_test)
Recall Score: 0.8206896551724138
# Let's visualize the tree
## creating a list of column names
feature_names = x_train.columns.to_list()
plt.figure(figsize=(20, 30))
out = tree.plot_tree(
    model,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# plot_tree returns the node annotations; below we re-draw the connecting
# arrows in black in case any are missing from the rendered figure.
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
plt.show()
# Text report showing the split rules of the decision tree.
print(tree.export_text(model, feature_names=feature_names, show_weights=True))
|--- logIncome <= 4.57 | |--- logCCAvg <= 1.37 | | |--- weights: [363.00, 0.00] class: 0 | |--- logCCAvg > 1.37 | | |--- CD_Account <= 0.50 | | | |--- land_area_in_sqmi <= 0.84 | | | | |--- weights: [0.00, 3.40] class: 1 | | | |--- land_area_in_sqmi > 0.84 | | | | |--- Age <= 29.50 | | | | | |--- radius_in_miles <= 5.00 | | | | | | |--- housing_units <= 10822.26 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- housing_units > 10822.26 | | | | | | | |--- lat <= 34.75 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- lat > 34.75 | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | |--- radius_in_miles > 5.00 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- Age > 29.50 | | | | | |--- population_density <= 255.50 | | | | | | |--- logCCAvg <= 1.46 | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | |--- logCCAvg > 1.46 | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | |--- population_density > 255.50 | | | | | | |--- Age <= 64.50 | | | | | | | |--- logCCAvg <= 1.42 | | | | | | | | |--- radius_in_miles <= 0.87 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- radius_in_miles > 0.87 | | | | | | | | | |--- Age <= 36.50 | | | | | | | | | | |--- Age <= 33.00 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | | |--- Age > 33.00 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | |--- Age > 36.50 | | | | | | | | | | |--- weights: [2.85, 0.00] class: 0 | | | | | | | |--- logCCAvg > 1.42 | | | | | | | | |--- logIncome <= 4.53 | | | | | | | | | |--- weights: [13.80, 0.00] class: 0 | | | | | | | | |--- logIncome > 4.53 | | | | | | | | | |--- land_area_in_sqmi <= 4.21 | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | |--- land_area_in_sqmi > 4.21 | | | | | | | | | | |--- weights: [1.05, 0.00] class: 0 | | | | | | |--- Age > 64.50 | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | |--- CD_Account > 0.50 | 
| | |--- Experience <= 11.50 | | | | |--- weights: [0.15, 0.00] class: 0 | | | |--- Experience > 11.50 | | | | |--- population_density <= 12384.50 | | | | | |--- weights: [0.00, 5.95] class: 1 | | | | |--- population_density > 12384.50 | | | | | |--- weights: [0.15, 0.00] class: 0 |--- logIncome > 4.57 | |--- Education <= 1.50 | | |--- Family <= 2.50 | | | |--- logIncome <= 4.65 | | | | |--- logCCAvg <= 1.44 | | | | | |--- occupied_housing_units <= 3062.50 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- occupied_housing_units > 3062.50 | | | | | | |--- weights: [3.00, 0.00] class: 0 | | | | |--- logCCAvg > 1.44 | | | | | |--- median_home_value <= 499800.00 | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | |--- median_home_value > 499800.00 | | | | | | |--- Age <= 31.50 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Age > 31.50 | | | | | | | |--- lat <= 32.85 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- lat > 32.85 | | | | | | | | |--- weights: [0.00, 5.10] class: 1 | | | |--- logIncome > 4.65 | | | | |--- logCCAvg <= 0.09 | | | | | |--- weights: [2.10, 0.00] class: 0 | | | | |--- logCCAvg > 0.09 | | | | | |--- weights: [64.20, 0.00] class: 0 | | |--- Family > 2.50 | | | |--- logIncome <= 4.65 | | | | |--- weights: [1.35, 0.00] class: 0 | | | |--- logIncome > 4.65 | | | | |--- median_household_income <= 146968.00 | | | | | |--- housing_units <= 24569.50 | | | | | | |--- water_area_in_sqmi <= 1.47 | | | | | | | |--- ZIPCode <= 90019.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- ZIPCode > 90019.50 | | | | | | | | |--- logIncome <= 4.71 | | | | | | | | | |--- Securities_Account <= 0.50 | | | | | | | | | | |--- population_density <= 5198.20 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | | |--- population_density > 5198.20 | | | | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | | | |--- Securities_Account > 0.50 | | | | | | | | | | 
|--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- logIncome > 4.71 | | | | | | | | | |--- weights: [0.00, 41.65] class: 1 | | | | | | |--- water_area_in_sqmi > 1.47 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- housing_units > 24569.50 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- median_household_income > 146968.00 | | | | | |--- weights: [0.30, 0.00] class: 0 | |--- Education > 1.50 | | |--- logIncome <= 4.76 | | | |--- logCCAvg <= 1.37 | | | | |--- logIncome <= 4.67 | | | | | |--- weights: [7.05, 0.00] class: 0 | | | | |--- logIncome > 4.67 | | | | | |--- CreditCard <= 0.50 | | | | | | |--- Experience <= 8.00 | | | | | | | |--- median_household_income <= 110476.50 | | | | | | | | |--- weights: [1.80, 0.00] class: 0 | | | | | | | |--- median_household_income > 110476.50 | | | | | | | | |--- Online <= 0.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- Online > 0.50 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- Experience > 8.00 | | | | | | | |--- Experience <= 30.50 | | | | | | | | |--- median_household_income <= 58043.00 | | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | | |--- median_household_income > 58043.00 | | | | | | | | | |--- median_household_income <= 102038.50 | | | | | | | | | | |--- lat <= 32.93 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | | |--- lat > 32.93 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | |--- median_household_income > 102038.50 | | | | | | | | | | |--- land_area_in_sqmi <= 10.31 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | | |--- land_area_in_sqmi > 10.31 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- Experience > 30.50 | | | | | | | | |--- population <= 20011.89 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- population > 20011.89 | | | | | | | | | |--- weights: [0.75, 0.00] 
class: 0 | | | | | |--- CreditCard > 0.50 | | | | | | |--- weights: [2.25, 0.00] class: 0 | | | |--- logCCAvg > 1.37 | | | | |--- land_area_in_sqmi <= 22.38 | | | | | |--- median_household_income <= 111896.00 | | | | | | |--- ZIPCode <= 90154.50 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- ZIPCode > 90154.50 | | | | | | | |--- radius_in_miles <= 0.51 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- radius_in_miles > 0.51 | | | | | | | | |--- ZIPCode <= 90389.50 | | | | | | | | | |--- logMortgage <= 5.50 | | | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | | | |--- logMortgage > 5.50 | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- ZIPCode > 90389.50 | | | | | | | | | |--- bounds_west <= -117.19 | | | | | | | | | | |--- ZIPCode <= 95052.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- ZIPCode > 95052.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- bounds_west > -117.19 | | | | | | | | | | |--- population_density <= 5520.50 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | | |--- population_density > 5520.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | |--- median_household_income > 111896.00 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- land_area_in_sqmi > 22.38 | | | | | |--- weights: [0.60, 0.00] class: 0 | | |--- logIncome > 4.76 | | | |--- logMortgage <= 2.17 | | | | |--- weights: [0.00, 124.10] class: 1 | | | |--- logMortgage > 2.17 | | | | |--- weights: [0.00, 64.60] class: 1
# Importance of features in the tree building. (The importance of a feature
# is computed as the (normalized) total reduction of the criterion brought
# by that feature; also known as the Gini importance.)
print(
    pd.DataFrame(
        model.feature_importances_, columns=["Imp"], index=x_train.columns
    ).sort_values(by="Imp", ascending=False)
)
Imp logIncome 5.904380e-01 Education 1.279533e-01 Family 1.273597e-01 logCCAvg 7.384586e-02 Age 1.457484e-02 land_area_in_sqmi 1.414116e-02 CD_Account 9.703671e-03 median_household_income 7.679082e-03 Experience 6.632953e-03 population_density 6.456326e-03 CreditCard 4.552670e-03 radius_in_miles 3.898084e-03 ZIPCode 2.227230e-03 median_home_value 2.090856e-03 housing_units 1.930995e-03 lat 1.551845e-03 logMortgage 1.251305e-03 population 1.251305e-03 water_area_in_sqmi 8.322909e-04 Securities_Account 7.994448e-04 Online 7.195003e-04 bounds_west 1.095816e-04 occupied_housing_units 2.565576e-17 bounds_north 0.000000e+00 lng 0.000000e+00 bounds_east 0.000000e+00 bounds_south 0.000000e+00
# Horizontal bar chart of feature importances, least to most important.
importances = model.feature_importances_
indices = np.argsort(importances)  # Returns the indices that would sort an array.
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="lightblue", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Hyperparameter tuning using grid search is done next.
# Choose the type of classifier.
estimator = DecisionTreeClassifier(random_state=1, class_weight={0: 0.15, 1: 0.85})
# scikit-learn uses an optimised version of the CART algorithm;
# however, the scikit-learn implementation does not support categorical variables for now.
# criterion - The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.
# splitter : {"best", "random"}, default="best" The strategy used to choose the split at each node.
# Supported strategies are "best" to choose the best split and "random" to choose the best random split.
# Using best, the model is taking the feature with the highest importance.
# Using random, the model is taking the feature randomly but with the same distribution
# (in gini, if a feature has an importance of 38% it will be taken in 38% of cases).
# The weight will penalize the classifier for misclassification of the rare 1 cases.
# min_impurity_decrease - A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# Grid of parameters to choose from
parameters = {
    "max_depth": [5, 10, 15, None],
    "criterion": ["entropy", "gini"],
    "splitter": ["best", "random"],
    "min_impurity_decrease": [0.00001, 0.0001, 0.01],
}
# Type of scoring used to compare parameter combinations.
# make_scorer is a factory function that wraps a performance metric for use
# by the grid search; here parameter combinations compete on recall.
scorer = make_scorer(recall_score)
# Run the grid search (5-fold cross-validation).
grid_obj = GridSearchCV(estimator, parameters, scoring=scorer, cv=5)
grid_obj = grid_obj.fit(x_train, y_train)
# Set the clf to the best combination of parameters
estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
estimator.fit(x_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, criterion='entropy',
max_depth=5, min_impurity_decrease=1e-05,
random_state=1)
# Let's look at the best combination of parameters.
grid_obj.best_params_
{'criterion': 'entropy',
'max_depth': 5,
'min_impurity_decrease': 1e-05,
'splitter': 'best'}
# Checking performance on training set
confusion_matrix_sklearn(estimator, x_train, y_train)
# Let's look at the recall score on training data
decision_tree_tune_perf_train = get_recall_score(estimator, x_train, y_train)
print("Recall Score:", decision_tree_tune_perf_train)
Recall Score: 0.9791044776119403
# Checking performance on test set
confusion_matrix_sklearn(estimator, x_test, y_test)
# Let's look at the recall score on test data
decision_tree_tune_perf_test = get_recall_score(estimator, x_test, y_test)
print("Recall Score:", decision_tree_tune_perf_test)
Recall Score: 0.9241379310344827
# Visualizing the Decision Tree
plt.figure(figsize=(15, 10))
out = tree.plot_tree(
estimator,
feature_names=feature_names,
filled=True,
fontsize=9,
node_ids=False,
class_names=None,
)
for o in out:
arrow = o.arrow_patch
if arrow is not None:
arrow.set_edgecolor("black")
arrow.set_linewidth(1)
plt.show()
# Text report showing the rules of a decision tree -
print(tree.export_text(estimator, feature_names=feature_names, show_weights=True))
|--- logIncome <= 4.53 | |--- logCCAvg <= 1.37 | | |--- weights: [355.35, 0.00] class: 0 | |--- logCCAvg > 1.37 | | |--- land_area_in_sqmi <= 0.84 | | | |--- weights: [0.00, 3.40] class: 1 | | |--- land_area_in_sqmi > 0.84 | | | |--- CD_Account <= 0.50 | | | | |--- Age <= 29.50 | | | | | |--- weights: [0.30, 2.55] class: 1 | | | | |--- Age > 29.50 | | | | | |--- weights: [16.80, 3.40] class: 0 | | | |--- CD_Account > 0.50 | | | | |--- Education <= 2.50 | | | | | |--- weights: [0.00, 3.40] class: 1 | | | | |--- Education > 2.50 | | | | | |--- weights: [0.15, 0.00] class: 0 |--- logIncome > 4.53 | |--- Education <= 1.50 | | |--- Family <= 2.50 | | | |--- logIncome <= 4.65 | | | | |--- logCCAvg <= 1.44 | | | | | |--- weights: [6.30, 0.00] class: 0 | | | | |--- logCCAvg > 1.44 | | | | | |--- weights: [1.80, 5.10] class: 1 | | | |--- logIncome > 4.65 | | | | |--- weights: [66.30, 0.00] class: 0 | | |--- Family > 2.50 | | | |--- logIncome <= 4.72 | | | | |--- CD_Account <= 0.50 | | | | | |--- weights: [3.30, 2.55] class: 0 | | | | |--- CD_Account > 0.50 | | | | | |--- weights: [0.00, 2.55] class: 1 | | | |--- logIncome > 4.72 | | | | |--- weights: [0.00, 39.95] class: 1 | |--- Education > 1.50 | | |--- logIncome <= 4.76 | | | |--- logCCAvg <= 1.37 | | | | |--- logIncome <= 4.67 | | | | | |--- weights: [10.65, 0.00] class: 0 | | | | |--- logIncome > 4.67 | | | | | |--- weights: [6.30, 7.65] class: 1 | | | |--- logCCAvg > 1.37 | | | | |--- land_area_in_sqmi <= 27.24 | | | | | |--- weights: [1.35, 25.50] class: 1 | | | | |--- land_area_in_sqmi > 27.24 | | | | | |--- weights: [0.60, 0.00] class: 0 | | |--- logIncome > 4.76 | | | |--- weights: [0.00, 188.70] class: 1
Observations from the tree:
The decision rules extracted above let us interpret the decision tree model; for example:
# importance of features in the tree building ( The importance of a feature is computed as the
# (normalized) total reduction of the 'criterion' brought by that feature. It is also known as the Gini importance )
print(
pd.DataFrame(
estimator.feature_importances_, columns=["Imp"], index=x_train.columns
).sort_values(by="Imp", ascending=False)
)
# Here we will see that importance of features has increased
Imp logIncome 0.612956 Education 0.130230 Family 0.117067 logCCAvg 0.111841 CD_Account 0.010890 land_area_in_sqmi 0.010482 Age 0.006534 population_density 0.000000 bounds_north 0.000000 bounds_east 0.000000 bounds_west 0.000000 median_household_income 0.000000 median_home_value 0.000000 occupied_housing_units 0.000000 housing_units 0.000000 water_area_in_sqmi 0.000000 lng 0.000000 population 0.000000 radius_in_miles 0.000000 Experience 0.000000 lat 0.000000 logMortgage 0.000000 CreditCard 0.000000 Online 0.000000 Securities_Account 0.000000 ZIPCode 0.000000 bounds_south 0.000000
importances = estimator.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Total impurity of leaves vs effective alphas of pruned tree.
# cost_complexity_pruning_path returns the candidate ccp_alpha values and
# the total leaf impurity each produces when pruning with that alpha.
clf = DecisionTreeClassifier(random_state=1, class_weight={0: 0.15, 1: 0.85})
path = clf.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
pd.DataFrame(path)
| ccp_alphas | impurities | |
|---|---|---|
| 0 | 0.000000e+00 | -7.012679e-15 |
| 1 | 7.509964e-19 | -7.011928e-15 |
| 2 | 7.509964e-19 | -7.011177e-15 |
| 3 | 1.767050e-18 | -7.009410e-15 |
| 4 | 7.024025e-18 | -7.002386e-15 |
| 5 | 8.636459e-18 | -6.993750e-15 |
| 6 | 1.206012e-17 | -6.981690e-15 |
| 7 | 4.881477e-16 | -6.493542e-15 |
| 8 | 1.389343e-15 | -5.104199e-15 |
| 9 | 1.940599e-04 | 3.881198e-04 |
| 10 | 1.961825e-04 | 1.172850e-03 |
| 11 | 1.982793e-04 | 1.569409e-03 |
| 12 | 3.382187e-04 | 1.907627e-03 |
| 13 | 3.382187e-04 | 2.245846e-03 |
| 14 | 3.560197e-04 | 2.957885e-03 |
| 15 | 3.581492e-04 | 4.032333e-03 |
| 16 | 3.748536e-04 | 5.906601e-03 |
| 17 | 3.757986e-04 | 6.658198e-03 |
| 18 | 3.788050e-04 | 7.415808e-03 |
| 19 | 3.912475e-04 | 8.589551e-03 |
| 20 | 6.215912e-04 | 9.832733e-03 |
| 21 | 6.764374e-04 | 1.050917e-02 |
| 22 | 7.694824e-04 | 1.127865e-02 |
| 23 | 9.828578e-04 | 1.226151e-02 |
| 24 | 1.066358e-03 | 1.439423e-02 |
| 25 | 1.149297e-03 | 1.784212e-02 |
| 26 | 1.232082e-03 | 1.907420e-02 |
| 27 | 1.374246e-03 | 2.182269e-02 |
| 28 | 1.413867e-03 | 2.323656e-02 |
| 29 | 1.657272e-03 | 2.489383e-02 |
| 30 | 2.140094e-03 | 2.703393e-02 |
| 31 | 2.308615e-03 | 2.934254e-02 |
| 32 | 3.071947e-03 | 3.241449e-02 |
| 33 | 3.339670e-03 | 3.575416e-02 |
| 34 | 3.735992e-03 | 3.949015e-02 |
| 35 | 3.987462e-03 | 4.347761e-02 |
| 36 | 4.128003e-03 | 4.760562e-02 |
| 37 | 4.561448e-03 | 5.216706e-02 |
| 38 | 6.750001e-03 | 5.891706e-02 |
| 39 | 9.479367e-03 | 6.839643e-02 |
| 40 | 1.078090e-02 | 7.917733e-02 |
| 41 | 1.952159e-02 | 9.869892e-02 |
| 42 | 5.953034e-02 | 1.582293e-01 |
| 43 | 5.980935e-02 | 2.180386e-01 |
| 44 | 2.520358e-01 | 4.700744e-01 |
# Plot total leaf impurity against effective alpha (the last point — the
# trivial single-node tree — is dropped from the plot).
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
plt.show()
# Next, we train a decision tree using each of the effective alphas.
# The last value in ccp_alphas is the alpha value that prunes the whole tree, leaving the tree, clfs[-1], with one node.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(
        random_state=1, ccp_alpha=ccp_alpha, class_weight={0: 0.15, 1: 0.85}
    )
    clf.fit(x_train, y_train)
    clfs.append(clf)
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]
    )
)
Number of nodes in the last tree is: 1 with ccp_alpha: 0.25203583584710687
# For the remainder, we remove the last element in clfs and ccp_alphas,
# because it is the trivial tree with only one node.
# Here we show that the number of nodes and tree depth decrease as alpha increases.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
fig.tight_layout()
# Recall on the training and test sets for every candidate pruned tree;
# comprehensions keep the ordering aligned with ccp_alphas.
recall_train = [recall_score(y_train, clf.predict(x_train)) for clf in clfs]
recall_test = [recall_score(y_test, clf.predict(x_test)) for clf in clfs]
# Accuracy curves computed for completeness (not plotted below).
train_scores = [clf.score(x_train, y_train) for clf in clfs]
test_scores = [clf.score(x_test, y_test) for clf in clfs]
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("alpha")
ax.set_ylabel("Recall")
ax.set_title("Recall vs alpha for training and testing sets")
ax.plot(
    ccp_alphas,
    recall_train,
    marker="o",
    label="train",
    drawstyle="steps-post",
)
ax.plot(ccp_alphas, recall_test, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()
max(recall_test)
0.9310344827586207
# Select the pruned tree whose TEST recall is highest (argmax returns the
# first index achieving the maximum).
index_best_model = np.argmax(recall_test)
best_model = clfs[index_best_model]
print(best_model)
DecisionTreeClassifier(ccp_alpha=0.0021400936046170466,
class_weight={0: 0.15, 1: 0.85}, random_state=1)
best_model.fit(x_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.0021400936046170466,
class_weight={0: 0.15, 1: 0.85}, random_state=1)
# Confusion matrix on training data for this best model
confusion_matrix_sklearn(best_model, x_train, y_train)
# recall score on training data
decision_tree_postpruned_perf_train = get_recall_score(best_model, x_train, y_train)
print("Recall Score:", decision_tree_postpruned_perf_train)
Recall Score: 0.9880597014925373
# Confusion matrix on test data for this best model
confusion_matrix_sklearn(best_model, x_test, y_test)
# recall score on test data
decision_tree_postpruned_perf_test = get_recall_score(best_model, x_test, y_test)
print("Recall Score:", decision_tree_postpruned_perf_test)
Recall Score: 0.9310344827586207
# Visualize the pruned decision tree. plot_tree returns the drawn node
# annotations; we darken and thicken each branch arrow for readability.
plt.figure(figsize=(15, 15))
node_artists = tree.plot_tree(
    best_model,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
for artist in node_artists:
    patch = artist.arrow_patch
    if patch is None:
        continue
    patch.set_edgecolor("black")
    patch.set_linewidth(1)
plt.show()
# Text report showing the rules of a decision tree; show_weights=True prints
# the (class-weighted) sample weights reaching each leaf
print(tree.export_text(best_model, feature_names=feature_names, show_weights=True))
|--- logIncome <= 4.57 | |--- logCCAvg <= 1.37 | | |--- weights: [363.00, 0.00] class: 0 | |--- logCCAvg > 1.37 | | |--- CD_Account <= 0.50 | | | |--- land_area_in_sqmi <= 0.84 | | | | |--- weights: [0.00, 3.40] class: 1 | | | |--- land_area_in_sqmi > 0.84 | | | | |--- Age <= 29.50 | | | | | |--- weights: [0.30, 2.55] class: 1 | | | | |--- Age > 29.50 | | | | | |--- population_density <= 255.50 | | | | | | |--- weights: [0.30, 1.70] class: 1 | | | | | |--- population_density > 255.50 | | | | | | |--- weights: [17.85, 3.40] class: 0 | | |--- CD_Account > 0.50 | | | |--- weights: [0.30, 5.95] class: 1 |--- logIncome > 4.57 | |--- Education <= 1.50 | | |--- Family <= 2.50 | | | |--- logIncome <= 4.65 | | | | |--- logCCAvg <= 1.44 | | | | | |--- weights: [3.15, 0.00] class: 0 | | | | |--- logCCAvg > 1.44 | | | | | |--- weights: [0.75, 5.10] class: 1 | | | |--- logIncome > 4.65 | | | | |--- weights: [66.30, 0.00] class: 0 | | |--- Family > 2.50 | | | |--- logIncome <= 4.65 | | | | |--- weights: [1.35, 0.00] class: 0 | | | |--- logIncome > 4.65 | | | | |--- weights: [0.90, 44.20] class: 1 | |--- Education > 1.50 | | |--- logIncome <= 4.76 | | | |--- logCCAvg <= 1.37 | | | | |--- logIncome <= 4.67 | | | | | |--- weights: [7.05, 0.00] class: 0 | | | | |--- logIncome > 4.67 | | | | | |--- weights: [6.30, 7.65] class: 1 | | | |--- logCCAvg > 1.37 | | | | |--- weights: [1.65, 22.10] class: 1 | | |--- logIncome > 4.76 | | | |--- weights: [0.00, 188.70] class: 1
# Feature importances of the pruned tree (normalized total reduction of the
# split criterion contributed by each feature, a.k.a. Gini importance),
# listed from most to least important.
imp_df = pd.DataFrame(
    best_model.feature_importances_,
    index=x_train.columns,
    columns=["Imp"],
)
print(imp_df.sort_values(by="Imp", ascending=False))
Imp logIncome 0.624418 Education 0.134997 Family 0.134368 logCCAvg 0.074776 CD_Account 0.010296 land_area_in_sqmi 0.009000 Age 0.006934 population_density 0.005211 bounds_north 0.000000 bounds_east 0.000000 bounds_west 0.000000 median_household_income 0.000000 median_home_value 0.000000 occupied_housing_units 0.000000 housing_units 0.000000 water_area_in_sqmi 0.000000 lng 0.000000 population 0.000000 radius_in_miles 0.000000 Experience 0.000000 lat 0.000000 logMortgage 0.000000 CreditCard 0.000000 Online 0.000000 Securities_Account 0.000000 ZIPCode 0.000000 bounds_south 0.000000
# Horizontal bar chart of feature importances, least important at the bottom
# (argsort gives ascending order, which barh draws bottom-to-top).
importances = best_model.feature_importances_
indices = np.argsort(importances)
positions = range(len(indices))
sorted_labels = [feature_names[i] for i in indices]
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(positions, importances[indices], color="violet", align="center")
plt.yticks(positions, sorted_labels)
plt.xlabel("Relative Importance")
plt.show()
# Training recall of the three decision-tree variants side by side
# (default tree, hyperparameter-tuned tree, post-pruned tree).
models_train_comp_df = pd.DataFrame(
    {
        "Recall on training set": [
            decision_tree_perf_train,
            decision_tree_tune_perf_train,
            decision_tree_postpruned_perf_train,
        ]
    }
)
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Recall on training set | |
|---|---|
| 0 | 1.000000 |
| 1 | 0.979104 |
| 2 | 0.988060 |
# Test recall of the three decision-tree variants side by side
# (default tree, hyperparameter-tuned tree, post-pruned tree).
models_test_comp_df = pd.DataFrame(
    {
        "Recall on testing set": [
            decision_tree_perf_test,
            decision_tree_tune_perf_test,
            decision_tree_postpruned_perf_test,
        ]
    }
)
print("Test performance comparison:")
models_test_comp_df
Test performance comparison:
| Recall on testing set | |
|---|---|
| 0 | 0.820690 |
| 1 | 0.924138 |
| 2 | 0.931034 |
According to the logistic regression model -
a) The higher a customer's income, the more likely he or she is to take a personal loan.
b) If a customer has a CD account, he or she is more likely to take a personal loan.
c) The next three attributes that predict whether a customer takes a personal loan are: Education level, CC balance, and whether he or she has a Credit Card from another bank.
d) The next two important attributes are: Family size and whether the customer uses online banking.
The decision tree model also picks the same top 5 attributes. Furthermore:
a) If the (log of) a customer's income is >= 4.659, he or she is more likely to take a personal loan.
b) If a customer Education is <= 1, he or she is less likely to take a personal loan.
c) If (log of) CCAvg is <= 1.374, he or she is less likely to take a personal loan.
The Top 5 counties with the highest % of Personal Loan customers are: Sonoma, Shasta, San Luis Obispo, Contra Costa and Kern. Note that county information was derived from the zip code.
AllLife Bank is a growing bank. But the dataset is for California state only. Caution is warranted if applying the model on customers in other regions.
The models predict that a liability customer is more likely to buy a personal loan.
Which variables are most significant? Income, Education, Family, CCAvg, CD_Account
Which segment of customers should be targeted more? AllLife Bank should target educated customers with a higher income, bigger families, who have a CD account and who carry CC balances.